# Prometheus alerting rules for Go runtime metrics (go_goroutines, go_threads,
# go_gc_duration_seconds, go_memstats_*).
# All thresholds are rough defaults; tune them to each service's baseline.
groups:
  - name: GolangExporter
    rules:
      # Threshold is a rough default. High-concurrency servers may legitimately
      # run thousands of goroutines. Adjust to match your baseline.
      - alert: GoGoroutineCountHigh
        expr: 'go_goroutines > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go goroutine count high (instance {{ $labels.instance }})'
          description: "Go application has too many goroutines (> 1000), potential goroutine leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # quantile="1" is the maximum observed GC pause in the current summary
      # window, not p99. A single outlier pause can push it above 1s; the
      # for: 5m requires the maximum to stay elevated before the alert fires.
      - alert: GoGcDurationHigh
        expr: 'go_gc_duration_seconds{quantile="1"} > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go GC duration high (instance {{ $labels.instance }})'
          description: "Go GC pause duration is too high (max > 1s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # go_memstats_sys_bytes is the total memory obtained from the OS by the
      # Go runtime, not total host memory. This ratio therefore measures
      # Go-internal memory utilization, not system-level memory pressure.
      - alert: GoMemoryUsageHigh
        expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go memory usage high (instance {{ $labels.instance }})'
          description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold is workload-dependent. Applications with heavy CGo or
      # blocking I/O may legitimately use more OS threads. Adjust to match
      # your baseline.
      - alert: GoThreadCountHigh
        expr: 'go_threads > 500'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go thread count high (instance {{ $labels.instance }})'
          description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold is a rough default. Adjust based on your application's
      # normal object count.
      - alert: GoHeapObjectsCountHigh
        expr: 'go_memstats_heap_objects > 10000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go heap objects count high (instance {{ $labels.instance }})'
          description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # rate(go_gc_duration_seconds_sum) is the number of seconds spent in GC
      # pauses per second, i.e. the fraction of wall-clock time spent paused
      # for GC. It is a proxy for GC overhead, not a direct CPU-share figure.
      # It replaces go_memstats_gc_cpu_fraction, which was removed in
      # client_golang v1.12+.
      - alert: GoGcCpuFractionHigh
        expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go GC CPU fraction high (instance {{ $labels.instance }})'
          description: "Go GC pauses are taking too much wall-clock time (> 5%), a proxy for high GC overhead\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A threshold of 100/s only catches catastrophic leaks (30k goroutines
      # in 5m). 10/s catches gradual leaks (~3k in 5m). Adjust based on your
      # application's expected concurrency patterns.
      - alert: GoGoroutineSpike
        expr: 'deriv(go_goroutines[5m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go goroutine spike (instance {{ $labels.instance }})'
          description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Alerts when the slope of heap in-use, fitted over a 10-minute window,
      # exceeds 10MB/s (1e7 bytes/s). The 10m deriv() window is the only
      # smoothing: with for: 0m the alert fires on the first evaluation where
      # the slope is above the threshold. Adjust threshold to your workload.
      - alert: GoHeapIn-useGrowing
        expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Go heap in-use growing (instance {{ $labels.instance }})'
          description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # go_memstats_alloc_bytes_total counts every byte ever allocated, even
      # if it was freed again, so this rule measures allocation rate (> 1e9
      # bytes/s), not retained memory. A high rate signals heavy allocation
      # churn and is only an indirect hint of a leak.
      - alert: GoMemoryLeak
        expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go memory leak (instance {{ $labels.instance }})'
          description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when stack memory in use stays above 1e9 bytes (1GB) for 5m.
      # Threshold is a rough default; adjust to match your baseline.
      - alert: GoStackMemoryHigh
        expr: 'go_memstats_stack_inuse_bytes > 1e9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Go stack memory high (instance {{ $labels.instance }})'
          description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"