From 281142567c7b02b6c0467a5da312ca50ba11f348 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Mar 2026 03:25:27 +0100
Subject: [PATCH 1/7] fix: use proper zero-traffic guard in Envoy ratio alerts
 (#511) (#513)

Replace `+ 1` denominator hack with `and ... > 0` filter in upstream
timeout rate and upstream 5xx error rate queries for mathematical
correctness and repo consistency.
---
 _data/rules.yml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 0e78127..4e989f7 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2363,18 +2363,14 @@ groups:
                 for: 5m
               - name: Envoy high cluster upstream request timeout rate
                 description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
-                query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5"
+                query: "increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0"
                 severity: warning
                 for: 5m
-                comments: |
-                  The +1 in the denominator guards against division by zero.
               - name: Envoy high cluster upstream 5xx error rate
                 description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
-                query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+                query: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
                 severity: critical
                 for: 1m
-                comments: |
-                  The +1 in the denominator guards against division by zero.
               - name: Envoy cluster health check failures
                 description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}"
                 query: "increase(envoy_cluster_health_check_failure[5m]) > 5"

From 2b239736cf6b80e954186d2282648f72f0235178 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Mar 2026 03:25:38 +0100
Subject: [PATCH 2/7] feat: add alerting rules for
 prometheus/memcached_exporter (#512)

---
 README.md       |  1 +
 _data/rules.yml | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/README.md b/README.md
index 6a6802f..9707db4 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
 - [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
 - [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
+- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
 - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
 - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
 - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
diff --git a/_data/rules.yml b/_data/rules.yml
index 4e989f7..898e2fc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1053,6 +1053,66 @@ groups:
                 query: "increase(redis_rejected_connections_total[1m]) > 5"
                 severity: warning
 
+      - name: Memcached
+        exporters:
+          - name: prometheus/memcached_exporter
+            slug: memcached-exporter
+            doc_url: https://github.com/prometheus/memcached_exporter
+            rules:
+              - name: Memcached down
+                description: "Memcached instance is down on {{ $labels.instance }}"
+                query: "memcached_up == 0"
+                severity: critical
+                for: 1m
+                comments: |
+                  1m delay allows a restart without triggering an alert.
+              - name: Memcached connection limit approaching (> 80%)
+                description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
+                query: "(memcached_current_connections / memcached_max_connections * 100) > 80"
+                severity: warning
+                for: 2m
+              - name: Memcached connection limit approaching (> 95%)
+                description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)"
+                query: "(memcached_current_connections / memcached_max_connections * 100) > 95"
+                severity: critical
+                for: 2m
+              - name: Memcached out of memory errors
+                description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}"
+                query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0"
+                severity: warning
+                for: 5m
+              - name: Memcached memory usage high (> 90%)
+                description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)"
+                query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90"
+                severity: warning
+                for: 5m
+                comments: |
+                  High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
+              - name: Memcached high eviction rate
+                description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)"
+                query: "rate(memcached_items_evicted_total[5m]) > 10"
+                severity: warning
+                for: 5m
+                comments: |
+                  A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
+              - name: Memcached low cache hit rate (< 80%)
+                description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
+                query: "(rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) / (rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) + rate(memcached_commands_total{command=\"get\", status=\"miss\"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) + rate(memcached_commands_total{command=\"get\", status=\"miss\"}[5m])) > 0"
+                severity: warning
+                for: 10m
+                comments: |
+                  A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
+              - name: Memcached connections rejected
+                description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
+                query: "increase(memcached_connections_rejected_total[5m]) > 0"
+                severity: warning
+                for: 5m
+              - name: Memcached items too large
+                description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
+                query: "increase(memcached_item_too_large_total[5m]) > 0"
+                severity: info
+                for: 5m
+
       - name: MongoDB
         exporters:
           - name: percona/mongodb_exporter

From d44bfd4c4b6dd1bc82e9a0503bbfa7302dbe8462 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 16 Mar 2026 02:26:04 +0000
Subject: [PATCH 3/7] Publish

---
 dist/rules/envoy/embedded-exporter.yml      |  6 +-
 dist/rules/memcached/memcached-exporter.yml | 91 +++++++++++++++++++++
 2 files changed, 93 insertions(+), 4 deletions(-)
 create mode 100644 dist/rules/memcached/memcached-exporter.yml

diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml
index bbe9aaa..f489b0c 100644
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@@ -77,9 +77,8 @@ groups:
         summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
         description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # The +1 in the denominator guards against division by zero.
     - alert: EnvoyHighClusterUpstreamRequestTimeoutRate
-      expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+      expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
       for: 5m
       labels:
         severity: warning
@@ -87,9 +86,8 @@ groups:
         summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
         description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    # The +1 in the denominator guards against division by zero.
     - alert: EnvoyHighClusterUpstream5xxErrorRate
-      expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+      expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
       for: 1m
       labels:
         severity: critical
diff --git a/dist/rules/memcached/memcached-exporter.yml b/dist/rules/memcached/memcached-exporter.yml
new file mode 100644
index 0000000..c1a68b3
--- /dev/null
+++ b/dist/rules/memcached/memcached-exporter.yml
@@ -0,0 +1,91 @@
+groups:
+
+- name: MemcachedExporter
+
+  
+  rules:
+
+    # 1m delay allows a restart without triggering an alert.
+    - alert: MemcachedDown
+      expr: 'memcached_up == 0'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Memcached down (instance {{ $labels.instance }})
+        description: "Memcached instance is down on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionLimitApproaching(>80%)
+      expr: '(memcached_current_connections / memcached_max_connections * 100) > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})
+        description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionLimitApproaching(>95%)
+      expr: '(memcached_current_connections / memcached_max_connections * 100) > 95'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})
+        description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MemcachedOutOfMemoryErrors
+      expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached out of memory errors (instance {{ $labels.instance }})
+        description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
+    - alert: MemcachedMemoryUsageHigh(>90%)
+      expr: '(memcached_current_bytes / memcached_limit_bytes * 100) > 90'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached memory usage high (> 90%) (instance {{ $labels.instance }})
+        description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
+    - alert: MemcachedHighEvictionRate
+      expr: 'rate(memcached_items_evicted_total[5m]) > 10'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached high eviction rate (instance {{ $labels.instance }})
+        description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
+    - alert: MemcachedLowCacheHitRate(<80%)
+      expr: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})
+        description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionsRejected
+      expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached connections rejected (instance {{ $labels.instance }})
+        description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MemcachedItemsTooLarge
+      expr: 'increase(memcached_item_too_large_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: info
+      annotations:
+        summary: Memcached items too large (instance {{ $labels.instance }})
+        description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From bf7b9028813a495f58df1a0c218752f85c26537f Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Mar 2026 03:31:18 +0100
Subject: [PATCH 4/7] feat: add process-exporter alerting rules
 (ncabatoff/process-exporter) (#514)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: add process-exporter alerting rules (ncabatoff/process-exporter)

* docs: add Process to README services list

* fix: address PR review feedback for process-exporter rules

- Rename service from "Process" to "Process Exporter" for clarity
- Fix grammar: "file descriptors usage" → "file descriptor usage"
- Clarify CPU alert description as core-equivalent percentage
- Rename "high disk IO" to "high disk write IO" for accuracy
---
 README.md       |  1 +
 _data/rules.yml | 68 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/README.md b/README.md
index 9707db4..d4f1954 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
 - [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
 - [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
+- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
 
 #### Databases and brokers
 
diff --git a/_data/rules.yml b/_data/rules.yml
index 898e2fc..ff3c3dc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -742,6 +742,74 @@ groups:
                 severity: warning
                 for: 5m
 
+      - name: Process Exporter
+        exporters:
+          - name: ncabatoff/process-exporter
+            slug: process-exporter
+            doc_url: https://github.com/ncabatoff/process-exporter
+            rules:
+              - name: Process exporter group down
+                description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_num_procs == 0'
+                severity: critical
+                for: 2m
+              - name: Process exporter high memory usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
+              - name: Process exporter high CPU usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
+                query: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
+                severity: warning
+                for: 5m
+                comments: |
+                  User and system CPU time are exported as separate series (mode label) and are summed here to get the total for the group. Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Adjust the 80% threshold based on expected workload.
+              - name: Process exporter high file descriptor usage
+                description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
+                severity: warning
+                for: 5m
+              - name: Process exporter file descriptors exhausted
+                description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
+                severity: critical
+                for: 2m
+              - name: Process exporter high swap usage
+                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 512MB is arbitrary. Adjust per group and environment.
+              - name: Process exporter zombie processes
+                description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
+                query: 'namedprocess_namegroup_states{state="Zombie"} > 0'
+                severity: warning
+                for: 5m
+              - name: Process exporter high context switching
+                description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
+                query: 'sum without (ctxswitchtype) (rate(namedprocess_namegroup_context_switches_total[5m])) > 10000'
+                severity: warning
+                for: 5m
+                comments: |
+                  Voluntary and nonvoluntary switches are exported as separate series (ctxswitchtype label) and are summed here. Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
+              - name: Process exporter high disk write IO
+                description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
+                query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
+                severity: warning
+                for: 5m
+                comments: |
+                  Threshold of 100MB/s is arbitrary. Adjust per group.
+              - name: Process exporter process restarting
+                description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
+                query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
+                severity: info
+                comments: |
+                  Detects restarts by watching for changes in the oldest process start time within the group.
+
   - name: Databases and brokers
     services:
       - name: MySQL

From 32f639da3b5e1c40d856860e206285b6d4f7cedc Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 16 Mar 2026 02:31:48 +0000
Subject: [PATCH 5/7] Publish

---
 .../process-exporter/process-exporter.yml     | 102 ++++++++++++++++++
 1 file changed, 102 insertions(+)
 create mode 100644 dist/rules/process-exporter/process-exporter.yml

diff --git a/dist/rules/process-exporter/process-exporter.yml b/dist/rules/process-exporter/process-exporter.yml
new file mode 100644
index 0000000..8603ede
--- /dev/null
+++ b/dist/rules/process-exporter/process-exporter.yml
@@ -0,0 +1,102 @@
+groups:
+
+- name: ProcessExporter
+
+  
+  rules:
+
+    - alert: ProcessExporterGroupDown
+      expr: 'namedprocess_namegroup_num_procs == 0'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Process exporter group down (instance {{ $labels.instance }})
+        description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
+    - alert: ProcessExporterHighMemoryUsage
+      expr: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high memory usage (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # User and system CPU time are exported as separate series (mode label) and are summed here to get the total for the group. Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. Adjust the 80% threshold based on expected workload.
+    - alert: ProcessExporterHighCpuUsage
+      expr: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high CPU usage (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ProcessExporterHighFileDescriptorUsage
+      expr: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high file descriptor usage (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ProcessExporterFileDescriptorsExhausted
+      expr: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Process exporter file descriptors exhausted (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Threshold of 512MB is arbitrary. Adjust per group and environment.
+    - alert: ProcessExporterHighSwapUsage
+      expr: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high swap usage (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ProcessExporterZombieProcesses
+      expr: 'namedprocess_namegroup_states{state="Zombie"} > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter zombie processes (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Voluntary and nonvoluntary switches are exported as separate series (ctxswitchtype label) and are summed here. Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
+    - alert: ProcessExporterHighContextSwitching
+      expr: 'sum without (ctxswitchtype) (rate(namedprocess_namegroup_context_switches_total[5m])) > 10000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high context switching (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Threshold of 100MB/s is arbitrary. Adjust per group.
+    - alert: ProcessExporterHighDiskWriteIo
+      expr: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Process exporter high disk write IO (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Detects restarts by watching for changes in the oldest process start time within the group.
+    - alert: ProcessExporterProcessRestarting
+      expr: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
+      for: 0m
+      labels:
+        severity: info
+      annotations:
+        summary: Process exporter process restarting (instance {{ $labels.instance }})
+        description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 20651aa10d54ba61700b8b22a1de114d408ffc2a Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Mar 2026 03:43:51 +0100
Subject: [PATCH 6/7] feat: add OpenStack alerting rules (openstack-exporter)
 (#515)

* feat: add OpenStack alerting rules (openstack-exporter)

Add 20 alerting rules for openstack-exporter/openstack-exporter covering
Nova, Neutron, Cinder, Octavia, and Placement services.

* docs: add OpenStack to README services list

* fix: align OpenStack load balancer alert name with operating_status semantics

The operating_status label uses ONLINE/OFFLINE/DEGRADED/ERROR values,
not ACTIVE. Rename alert to "not online" and use the label in the
description for clarity.
---
 README.md       |   1 +
 _data/rules.yml | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

diff --git a/README.md b/README.md
index d4f1954..8cf7581 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
 - [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
 - [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
+- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
 
 #### Network, security and storage
 
diff --git a/_data/rules.yml b/_data/rules.yml
index ff3c3dc..2005fdc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -3219,6 +3219,117 @@ groups:
                 severity: warning
                 for: 15m
 
+      - name: OpenStack
+        exporters:
+          - name: openstack-exporter/openstack-exporter
+            slug: openstack-exporter
+            doc_url: https://github.com/openstack-exporter/openstack-exporter
+            rules:
+              - name: OpenStack exporter down
+                description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.
+                query: 'up{job=~".*openstack.*"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack Nova agent down
+                description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
+                query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack Neutron agent down
+                description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down"
+                query: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack Cinder agent down
+                description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
+                query: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack hypervisor high vCPU usage
+                description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%"
+                query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+              - name: OpenStack hypervisor high memory usage
+                description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%"
+                query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+              - name: OpenStack hypervisor high disk usage
+                description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%"
+                query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Nova tenant vCPU quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota"
+                query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
+                severity: warning
+                comments: |
+                  A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
+              - name: OpenStack Nova tenant memory quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota"
+                query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
+                severity: warning
+              - name: OpenStack Nova tenant instance quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota"
+                query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
+                severity: warning
+              - name: OpenStack Cinder tenant volume quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota"
+                query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
+                severity: warning
+              - name: OpenStack Cinder pool low free capacity
+                description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity"
+                query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron floating IPs associated but not active
+                description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state"
+                query: 'openstack_neutron_floating_ips_associated_not_active > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron routers not active
+                description: "{{ $value }} Neutron routers are not in ACTIVE state"
+                query: 'openstack_neutron_routers_not_active > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron subnet IP pool exhaustion
+                description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool"
+                query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
+                severity: warning
+              - name: OpenStack Neutron ports without IPs
+                description: "{{ $value }} active ports have no IP addresses assigned"
+                query: 'openstack_neutron_ports_no_ips > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack load balancer not online
+                description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}"
+                query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Nova instances in ERROR state
+                description: "{{ $value }} Nova instances are in ERROR state"
+                query: 'count(openstack_nova_server_status{status="ERROR"}) > 0'  # count series, not sum: the gauge value is a status index, not 1 per server
+                severity: warning
+                for: 5m
+              - name: OpenStack Cinder volumes in error state
+                description: "{{ $value }} Cinder volumes are in {{ $labels.status }} state"
+                query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'  # one series (and one alert) per matching status value
+                severity: warning
+                for: 5m
+              - name: OpenStack placement resource high usage
+                description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation"
+                query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  This alert factors in the allocation ratio to compute effective capacity.
+                  The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
+
   - name: Network, security and storage
     services:
       - name: Ceph

From 258220b4f052614bb36f8557556b3edc4940825e Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 16 Mar 2026 02:44:20 +0000
Subject: [PATCH 7/7] Publish

---
 dist/rules/openstack/openstack-exporter.yml | 191 ++++++++++++++++++++
 1 file changed, 191 insertions(+)
 create mode 100644 dist/rules/openstack/openstack-exporter.yml

diff --git a/dist/rules/openstack/openstack-exporter.yml b/dist/rules/openstack/openstack-exporter.yml
new file mode 100644
index 0000000..d55688b
--- /dev/null
+++ b/dist/rules/openstack/openstack-exporter.yml
@@ -0,0 +1,191 @@
+groups:
+
+- name: OpenstackExporter
+
+  
+  rules:
+
+    - alert: OpenstackExporterDown
+      expr: 'up{job=~".*openstack.*"} == 0'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: OpenStack exporter down (instance {{ $labels.instance }})
+        description: "The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNovaAgentDown
+      expr: 'openstack_nova_agent_state{adminState="enabled"} == 0'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: OpenStack Nova agent down (instance {{ $labels.instance }})
+        description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNeutronAgentDown
+      expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: OpenStack Neutron agent down (instance {{ $labels.instance }})
+        description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackCinderAgentDown
+      expr: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: OpenStack Cinder agent down (instance {{ $labels.instance }})
+        description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+    - alert: OpenstackHypervisorHighVcpuUsage
+      expr: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }})
+        description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+    - alert: OpenstackHypervisorHighMemoryUsage
+      expr: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack hypervisor high memory usage (instance {{ $labels.instance }})
+        description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackHypervisorHighDiskUsage
+      expr: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack hypervisor high disk usage (instance {{ $labels.instance }})
+        description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
+    - alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted
+      expr: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }})
+        description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted
+      expr: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }})
+        description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted
+      expr: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }})
+        description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted
+      expr: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }})
+        description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackCinderPoolLowFreeCapacity
+      expr: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Cinder pool low free capacity (instance {{ $labels.instance }})
+        description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNeutronFloatingIpsAssociatedButNotActive
+      expr: 'openstack_neutron_floating_ips_associated_not_active > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }})
+        description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNeutronRoutersNotActive
+      expr: 'openstack_neutron_routers_not_active > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Neutron routers not active (instance {{ $labels.instance }})
+        description: "{{ $value }} Neutron routers are not in ACTIVE state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNeutronSubnetIpPoolExhaustion
+      expr: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }})
+        description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNeutronPortsWithoutIps
+      expr: 'openstack_neutron_ports_no_ips > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Neutron ports without IPs (instance {{ $labels.instance }})
+        description: "{{ $value }} active ports have no IP addresses assigned\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackLoadBalancerNotOnline
+      expr: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack load balancer not online (instance {{ $labels.instance }})
+        description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackNovaInstancesInErrorState
+      expr: 'count(openstack_nova_server_status{status="ERROR"}) > 0'  # count series, not sum: the gauge value is a status index, not 1 per server
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Nova instances in ERROR state (instance {{ $labels.instance }})
+        description: "{{ $value }} Nova instances are in ERROR state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: OpenstackCinderVolumesInErrorState
+      expr: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'  # one series (and one alert) per matching status value
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack Cinder volumes in error state (instance {{ $labels.instance }})
+        description: "{{ $value }} Cinder volumes are in {{ $labels.status }} state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # This alert factors in the allocation ratio to compute effective capacity.
+    # The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
+    - alert: OpenstackPlacementResourceHighUsage
+      expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: OpenStack placement resource high usage (instance {{ $labels.instance }})
+        description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"