From bf7b9028813a495f58df1a0c218752f85c26537f Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 16 Mar 2026 03:31:18 +0100
Subject: [PATCH] feat: add process-exporter alerting rules (ncabatoff/process-exporter) (#514)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add process-exporter alerting rules (ncabatoff/process-exporter)

* docs: add Process to README services list

* fix: address PR review feedback for process-exporter rules

- Rename service from "Process" to "Process Exporter" for clarity
- Fix grammar: "file descriptors usage" → "file descriptor usage"
- Clarify CPU alert description as core-equivalent percentage
- Rename "high disk IO" to "high disk write IO" for accuracy
---
 README.md       |  1 +
 _data/rules.yml | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/README.md b/README.md
index 9707db4..d4f1954 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
 - [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
 - [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
+- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
 
 #### Databases and brokers
 
diff --git a/_data/rules.yml b/_data/rules.yml
index 898e2fc..ff3c3dc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -742,6 +742,74 @@ groups:
                 severity: warning
                 for: 5m
 
+      - name: Process Exporter
+        exporters:
+          - name: ncabatoff/process-exporter
+            slug: process-exporter
+            doc_url: https://github.com/ncabatoff/process-exporter
+            rules:
+              - name: Process exporter group down
+                description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_num_procs == 0'
+                severity: critical
+                for: 2m
+              - name: Process exporter high memory usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
+              - name: Process exporter high CPU usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
+                query: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
+                severity: warning
+                for: 5m
+                comments: |
+                  User and system CPU time are summed per group. Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. The 80% threshold therefore means 0.8 of one core. Adjust based on expected workload.
+              - name: Process exporter high file descriptor usage
+                description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
+                severity: warning
+                for: 5m
+              - name: Process exporter file descriptors exhausted
+                description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
+                severity: critical
+                for: 2m
+              - name: Process exporter high swap usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 512MB is arbitrary. Adjust per group and environment.
+              - name: Process exporter zombie processes
+                description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_states{state="Zombie"} > 0'
+                severity: warning
+                for: 5m
+              - name: Process exporter high context switching
+                description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
+                query: 'sum without (ctxswitchtype) (rate(namedprocess_namegroup_context_switches_total[5m])) > 10000'
+                severity: warning
+                for: 5m
+                comments: |
+                  Voluntary and nonvoluntary switches are summed per group. Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
+              - name: Process exporter high disk write IO
+                description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
+                query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 100MB/s is arbitrary. Adjust per group.
+              - name: Process exporter process restarting
+                description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
+                query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
+                severity: info
+                comments: |
+                  Detects restarts by watching for changes in the oldest process start time within the group.
+
   - name: Databases and brokers
     services:
       - name: MySQL