From 879436f440554fe6e4c89964474893316e6906f1 Mon Sep 17 00:00:00 2001
From: samber
Date: Sun, 15 Mar 2026 18:47:04 +0000
Subject: [PATCH] Publish

---
 dist/rules/golang/golang-exporter.yml | 105 ++++++++++++++++++++++++++
 dist/rules/jvm/jvm-exporter.yml       | 105 ++++++++++++++++++++++++++
 dist/rules/python/python-exporter.yml |  53 +++++++++++++
 dist/rules/ruby/ruby-exporter.yml     |  52 +++++++++++++
 4 files changed, 315 insertions(+)
 create mode 100644 dist/rules/golang/golang-exporter.yml
 create mode 100644 dist/rules/python/python-exporter.yml
 create mode 100644 dist/rules/ruby/ruby-exporter.yml

diff --git a/dist/rules/golang/golang-exporter.yml b/dist/rules/golang/golang-exporter.yml
new file mode 100644
index 0000000..cd5e777
--- /dev/null
+++ b/dist/rules/golang/golang-exporter.yml
@@ -0,0 +1,105 @@
+groups:
+
+- name: GolangExporter
+
+
+  rules:
+
+    # Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
+    - alert: GoGoroutineCountHigh
+      expr: 'go_goroutines > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go goroutine count high (instance {{ $labels.instance }})
+        description: "Go application has too many goroutines (> 1000), potential goroutine leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # quantile="1" is the maximum observed GC pause in the current summary window, not p99.
+    # A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated.
+    - alert: GoGcDurationHigh
+      expr: 'go_gc_duration_seconds{quantile="1"} > 1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go GC duration high (instance {{ $labels.instance }})
+        description: "Go GC pause duration is too high (max > 1s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory.
+    # This ratio measures Go-internal memory utilization, not system-level memory pressure.
+    - alert: GoMemoryUsageHigh
+      expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go memory usage high (instance {{ $labels.instance }})
+        description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
+    - alert: GoThreadCountHigh
+      expr: 'go_threads > 50'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go thread count high (instance {{ $labels.instance }})
+        description: "Go OS thread count is high (> 50), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Threshold is a rough default. Adjust based on your application's normal object count.
+    - alert: GoHeapObjectsCountHigh
+      expr: 'go_memstats_heap_objects > 10000000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go heap objects count high (instance {{ $labels.instance }})
+        description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # go_memstats_gc_cpu_fraction is deprecated since Go 1.20 and may return 0 in newer versions.
+    # Consider using runtime/metrics-based alternatives if running Go >= 1.20.
+    - alert: GoGcCpuFractionHigh
+      expr: 'go_memstats_gc_cpu_fraction > 0.05'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go GC CPU fraction high (instance {{ $labels.instance }})
+        description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: GoGoroutineSpike
+      expr: 'deriv(go_goroutines[5m]) > 100'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go goroutine spike (instance {{ $labels.instance }})
+        description: "Go goroutine count is growing rapidly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: GoHeapFragmentation
+      expr: 'go_memstats_heap_idle_bytes / go_memstats_heap_sys_bytes > 0.9'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go heap fragmentation (instance {{ $labels.instance }})
+        description: "Go heap has high idle ratio (> 90%), indicating memory fragmentation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: GoMemoryLeak
+      expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go memory leak (instance {{ $labels.instance }})
+        description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: GoStackMemoryHigh
+      expr: 'go_memstats_stack_inuse_bytes > 1e9'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Go stack memory high (instance {{ $labels.instance }})
+        description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/jvm/jvm-exporter.yml b/dist/rules/jvm/jvm-exporter.yml
index 8828f52..67f7842 100644
--- a/dist/rules/jvm/jvm-exporter.yml
+++ b/dist/rules/jvm/jvm-exporter.yml
@@ -13,3 +13,108 @@ groups:
       annotations:
         summary: JVM memory filling up (instance {{ $labels.instance }})
         description: "JVM memory is filling up (> 80%)\n VALUE = 
{{ $value }}\n LABELS = {{ $labels }}"
+
+    # Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1.
+    # The "> 0" filter drops such series so no meaningless negative percentage is computed; consequently this alert cannot fire when non-heap is unbounded.
+    - alert: JvmNon-heapMemoryFillingUp
+      expr: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM non-heap memory filling up (instance {{ $labels.instance }})
+        description: "JVM non-heap memory (metaspace/code cache) is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmGcTimeTooHigh
+      expr: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM GC time too high (instance {{ $labels.instance }})
+        description: "JVM is spending too much time in garbage collection (> 5% of wall clock time)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmThreadsDeadlocked
+      expr: 'jvm_threads_deadlocked > 0'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: JVM threads deadlocked (instance {{ $labels.instance }})
+        description: "JVM has deadlocked threads\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmThreadCountHigh
+      expr: 'jvm_threads_current > 300'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM thread count high (instance {{ $labels.instance }})
+        description: "JVM thread count is high (> 300), potential thread leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmThreadsBlocked
+      expr: 'jvm_threads_state{state="BLOCKED"} > 50'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM threads BLOCKED (instance {{ $labels.instance }})
+        description: "JVM has high number of BLOCKED threads, indicating lock contention\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # 
The gc label carries the collector's MXBean name, which is capitalized (e.g. "G1 Old Generation", "PS MarkSweep"), so the match is case-insensitive via (?i).
+    # It covers old/major/MarkSweep collectors (G1, Parallel, Serial, CMS, generational ZGC major cycles) but not Shenandoah or non-generational ZGC. Adjust the gc label filter if you use a different collector.
+    - alert: JvmOldGenGcFrequency
+      expr: 'rate(jvm_gc_collection_seconds_count{gc=~"(?i).*(old|major|marksweep).*"}[5m]) > 0.3'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM old gen GC frequency (instance {{ $labels.instance }})
+        description: "Frequent old/major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmDirectBufferPoolFillingUp
+      expr: '(jvm_buffer_pool_used_bytes{pool="direct"} / jvm_buffer_pool_capacity_bytes{pool="direct"}) * 100 > 90'  # NOTE(review): capacity_bytes appears to be the capacity of currently allocated buffers, not a configured limit, so this ratio may sit near 100% in normal operation - confirm.
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM direct buffer pool filling up (instance {{ $labels.instance }})
+        description: "JVM direct buffer pool is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmObjectsPendingFinalization
+      expr: 'jvm_memory_objects_pending_finalization > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM objects pending finalization (instance {{ $labels.instance }})
+        description: "JVM has objects pending finalization, potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific.
+    # The "and on (instance, job) jvm_threads_current" clause restricts the alert to targets that also expose JVM metrics, so Go, Python or other processes do not trigger it.
+    - alert: JvmFileDescriptorsExhaustion
+      expr: '(process_open_fds / process_max_fds) * 100 > 90 and on (instance, job) jvm_threads_current'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM file descriptors exhaustion (instance {{ $labels.instance }})
+        description: "JVM process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmClassLoadingAnomaly
+      expr: 'rate(jvm_classes_loaded_total[5m]) > 100'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM class loading anomaly (instance {{ $labels.instance }})
+        description: "Rapid class loading detected, potential classloader leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: JvmCompilationTimeSpike
+      expr: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: JVM compilation time spike (instance {{ $labels.instance }})
+        description: "Excessive JIT compilation time consuming CPU\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/python/python-exporter.yml b/dist/rules/python/python-exporter.yml
new file mode 100644
index 0000000..2230ae5
--- /dev/null
+++ b/dist/rules/python/python-exporter.yml
@@ -0,0 +1,53 @@
+groups:
+
+- name: PythonExporter
+
+
+  rules:
+
+    - alert: PythonGcObjectsUncollectable
+      expr: 'increase(python_gc_objects_uncollectable_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Python GC objects uncollectable (instance {{ $labels.instance }})
+        description: "Python has uncollectable objects, potential memory leak via reference cycles\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PythonGcCollectionsHigh
+      expr: 'rate(python_gc_objects_collected_total[5m]) > 10000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Python GC collections high (instance {{ $labels.instance }})
+        description: "Python GC is collecting too many objects (> 10k/s), high allocation pressure\n VALUE = {{ 
$value }}\n LABELS = {{ $labels }}"
+
+    # process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific; the "and on (instance, job) python_info" clause limits the alert to targets that also expose python_info.
+    - alert: PythonFileDescriptorsExhaustion
+      expr: '(process_open_fds / process_max_fds) * 100 > 90 and on (instance, job) python_info'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Python file descriptors exhaustion (instance {{ $labels.instance }})
+        description: "Python process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PythonGcGeneration2CollectionsHigh
+      expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Python GC generation 2 collections high (instance {{ $labels.instance }})
+        description: "Python full GC (generation 2) is running too frequently, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Threshold is a rough default. Adjust based on your application's expected memory footprint. process_virtual_memory_bytes is a generic process metric, so the python_info join keeps non-Python targets from triggering this alert.
+    - alert: PythonVirtualMemoryHigh
+      expr: 'process_virtual_memory_bytes > 4e9 and on (instance, job) python_info'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Python virtual memory high (instance {{ $labels.instance }})
+        description: "Python process virtual memory is high (> 4GB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/ruby/ruby-exporter.yml b/dist/rules/ruby/ruby-exporter.yml
new file mode 100644
index 0000000..0b526cb
--- /dev/null
+++ b/dist/rules/ruby/ruby-exporter.yml
@@ -0,0 +1,52 @@
+groups:
+
+- name: RubyExporter
+
+
+  rules:
+
+    # Threshold is a rough default. Adjust based on your application's normal heap size.
+    - alert: RubyHeapLiveSlotsHigh
+      expr: 'ruby_heap_live_slots > 500000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Ruby heap live slots high (instance {{ $labels.instance }})
+        description: "Ruby heap has too many live slots (> 500k), heap bloat\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RubyHeapFreeSlotsHigh
+      expr: 'ruby_heap_free_slots > 500000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Ruby heap free slots high (instance {{ $labels.instance }})
+        description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RubyMajorGcRateHigh
+      expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Ruby major GC rate high (instance {{ $labels.instance }})
+        description: "Ruby is performing too many major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RubyRssHigh
+      expr: 'ruby_rss > 1e9'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Ruby RSS high (instance {{ $labels.instance }})
+        description: "Ruby process RSS is high (> 1GB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RubyAllocatedObjectsSpike
+      expr: 'rate(ruby_allocated_objects_total[5m]) > 100000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Ruby allocated objects spike (instance {{ $labels.instance }})
+        description: "Ruby is allocating objects at a high rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"