Add couchdb alerts (#472)

* add: additional essential ClickHouse alerts
* Add new ClickHouse alert rules for monitoring
* linting
* add: CouchDB rules config in rules.yml
* add: CouchDB alerts in rules directory

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
2026-06-21 00:47:18 +08:00 · 2025-09-01 17:10:42 +03:30 · 2025-09-01 17:10:42 +03:30 · a2c31358d1
commit a2c31358d1
parent edae18b8df
3 changed files with 261 additions and 183 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1445,6 +1445,103 @@ groups:
                severity: critical
                for: 2m

+      - name: CouchDB
+        exporters:
+          - name: gesellix/couchdb-prometheus-exporter
+            slug: gesellix-couchdb-prometheus-exporter
+            doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
+            rules:
+              - name: CouchDB node down
+                description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
+                query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
+                severity: critical
+                for: 2m
+              - name: CouchDB atom memory usage critical
+                description: Atom memory usage is above 90% of limit
+                query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
+                severity: critical
+                for: 5m
+              - name: CouchDB open databases critical
+                description: Number of open databases exceeds 90% of node capacity
+                query: "couchdb_httpd_open_databases > 0.9 * 1000"
+                severity: critical
+                for: 5m
+              - name: CouchDB open OS files critical
+                description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
+                query: "couchdb_httpd_open_os_files > 0.9 * 65535"
+                severity: critical
+                for: 5m
+              - name: CouchDB 5xx error ratio high  # 5xx series are summed across `code` so both sides of the ratio share labels
+                description: More than 5% of HTTP requests are returning 5xx errors
+                query: "sum without (code) (rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05"
+                severity: critical
+                for: 5m
+              - name: CouchDB temporary view read rate critical
+                description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
+                query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
+                severity: critical
+                for: 5m
+              - name: CouchDB Mango queries scanning too many docs
+                description: Some Mango queries are scanning too many documents, consider adding indexes
+                query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
+                severity: warning
+                for: 5m
+              - name: CouchDB Mango queries failed due to invalid index
+                description: Some Mango queries failed to execute because the index was missing or invalid
+                query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
+                severity: warning
+                for: 5m
+              - name: CouchDB Mango docs examined high
+                description: High number of documents examined per Mango queries, consider indexing
+                query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
+                severity: warning
+                for: 5m
+              - name: CouchDB Replicator manager died
+                description: Replication manager process has crashed
+                query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+              - name: CouchDB Replicator queue process died
+                description: Replication queue process has crashed
+                query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+              - name: CouchDB Replicator reader process died
+                description: Replication reader process has crashed
+                query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+              - name: CouchDB Replicator failed to start
+                description: One or more replication tasks failed to start
+                query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
+                severity: critical
+                for: 1m
+              - name: CouchDB replication cluster unstable
+                description: The replication cluster is unstable, replication may be interrupted
+                query: "couchdb_replicator_cluster_is_stable == 0"
+                severity: critical
+                for: 2m
+              - name: CouchDB replication read failures
+                description: Replication changes feed has failed reads more than 5 times in 5 minutes
+                query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
+                severity: warning
+                for: 5m
+              - name: CouchDB file descriptors high
+                description: Process is using more than 85% of allowed file descriptors
+                query: "process_open_fds / process_max_fds > 0.85"
+                severity: warning
+                for: 5m
+              - name: CouchDB process restarted
+                description: CouchDB process has restarted recently
+                query: "changes(process_start_time_seconds[1h]) > 0"
+                severity: critical
+                for: 1m
+              - name: CouchDB critical log entries
+                description: Critical or error log entries detected in the last 5 minutes
+                query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
+                severity: critical
+                for: 1m
+
      - name: Zookeeper
        exporters:
          - name: cloudflare/kafka_zookeeper_exporter
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@@ -1,185 +1,2 @@
-groups:

- name: EmbeddedExporter

-  rules:
-
-    - alert: ClickhouseNodeDown
-      expr: 'up{job="clickhouse"} == 0'
-      for: 2m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse node down (instance {{ $labels.instance }})
-        description: "No metrics received from ClickHouse exporter for over 2 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseMemoryUsageCritical
-      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
-      for: 5m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
-        description: "Memory usage is critically high, over 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseMemoryUsageWarning
-      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
-        description: "Memory usage is over 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseDiskSpaceLowOnDefault
-      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
-        description: "Disk space on default is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseDiskSpaceCriticalOnDefault
-      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
-      for: 2m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
-        description: "Disk space on default disk is critically low, below 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseDiskSpaceLowOnBackups
-      expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
-        description: "Disk space on backups is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseReplicaErrors
-      expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
-        description: "Critical replica errors detected, either all replicas are stale or lost.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseNoAvailableReplicas
-      expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
-        description: "No available replicas in ClickHouse.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseNoLiveReplicas
-      expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
-      for: 0m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
-        description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseHighNetworkTraffic
-      expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
-        description: "Network traffic is unusually high, may affect cluster performance.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseHighTcpConnections
-      expr: 'ClickHouseMetrics_TCPConnection > 400'
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
-        description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseInterserverConnectionIssues
-      expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
-      for: 1m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
-        description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseZookeeperConnectionIssues
-      expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
-      for: 3m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
-        description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseAuthenticationFailures
-      expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
-      for: 0m
-      labels:
-        severity: info
-      annotations:
-        summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
-        description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseAccessDeniedErrors
-      expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
-      for: 0m
-      labels:
-        severity: info
-      annotations:
-        summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
-        description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseRejectedInsertQueries
-      expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0'
-      for: 1m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse rejected insert queries (instance {{ $labels.instance }})
-        description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseDelayedInsertQueries
-      expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse delayed insert queries (instance {{ $labels.instance }})
-        description: "INSERTs delayed due to high number of active parts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseZookeeperHardwareException
-      expr: 'increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0'
-      for: 1m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
-        description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseHighNetworkUsage
-      expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: ClickHouse high network usage (instance {{ $labels.instance }})
-        description: "High network usage. ClickHouse network usage exceeds 100MB/s.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: ClickhouseDistributedRejectedInserts
-      expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0'
-      for: 2m
-      labels:
-        severity: critical
-      annotations:
-        summary: ClickHouse distributed rejected inserts (instance {{ $labels.instance }})
-        description: "INSERTs into Distributed tables rejected due to pending bytes limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
+++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
@@ -0,0 +1,164 @@
+groups:
+  - name: couchdb-alerts
+    rules:
+      - alert: CouchDBNodeDown  # either liveness gauge reading 0 counts as down
+        expr: couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB node down"
+          description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes."
+
+      - alert: CouchDBAtomMemoryHigh
+        expr: couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB atom memory usage critical"
+          description: "Atom memory usage is above 90% of limit."
+
+      - alert: CouchDBOpenDatabasesCritical
+        expr: couchdb_httpd_open_databases > 0.9 * 1000
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB open databases critical"
+          description: "Number of open databases exceeds 90% of node capacity."
+
+      - alert: CouchDBOpenOSFilesCritical
+        expr: couchdb_httpd_open_os_files > 0.9 * 65535
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB open OS files critical"
+          description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files."
+
+      - alert: CouchDB5xxErrorsHigh  # 5xx series are summed across `code` so both sides of the ratio share labels
+        expr: sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB 5xx error ratio high"
+          description: "More than 5% of HTTP requests are returning 5xx errors."
+
+      - alert: CouchDBTemporaryViewReadsCritical
+        expr: rate(couchdb_httpd_temporary_view_reads[5m]) > 100
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB temporary view read rate critical"
+          description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation."
+
+      - alert: CouchDBMangoTooManyDocsScanned
+        expr: rate(couchdb_mango_too_many_docs_scanned[5m]) > 50
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CouchDB Mango queries scanning too many docs"
+          description: "Some Mango queries are scanning too many documents, consider adding indexes."
+
+      - alert: CouchDBMangoInvalidIndexQueries
+        expr: rate(couchdb_mango_query_invalid_index[5m]) > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CouchDB Mango queries failed due to invalid index"
+          description: "Some Mango queries failed to execute because the index was missing or invalid."
+
+      - alert: CouchDBMangoDocsExaminedHigh
+        expr: rate(couchdb_mango_docs_examined[5m]) > 1000
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CouchDB Mango docs examined high"
+          description: "High number of documents examined per Mango queries, consider indexing."
+
+      - alert: CouchDBReplicatorManagerDeaths
+        expr: increase(couchdb_replicator_changes_manager_deaths[5m]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB Replicator manager died"
+          description: "Replication manager process has crashed."
+
+      - alert: CouchDBReplicatorQueueDeaths
+        expr: increase(couchdb_replicator_changes_queue_deaths[5m]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB Replicator queue process died"
+          description: "Replication queue process has crashed."
+
+      - alert: CouchDBReplicatorReaderDeaths
+        expr: increase(couchdb_replicator_changes_reader_deaths[5m]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB Replicator reader process died"
+          description: "Replication reader process has crashed."
+
+      - alert: CouchDBReplicatorFailedStarts
+        expr: increase(couchdb_replicator_failed_starts[5m]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB Replicator failed to start"
+          description: "One or more replication tasks failed to start."
+
+      - alert: CouchDBReplicatorClusterUnstable
+        expr: couchdb_replicator_cluster_is_stable == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB replication cluster unstable"
+          description: "The replication cluster is unstable, replication may be interrupted."
+
+      - alert: CouchDBReplicatorChangesReadFailures
+        expr: increase(couchdb_replicator_changes_read_failures[5m]) > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CouchDB replication read failures"
+          description: "Replication changes feed has failed reads more than 5 times in 5 minutes."
+
+      - alert: CouchDBOpenFDsHigh
+        expr: process_open_fds / process_max_fds > 0.85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "CouchDB file descriptors high"
+          description: "Process is using more than 85% of allowed file descriptors."
+
+      - alert: CouchDBProcessRestarted
+        expr: changes(process_start_time_seconds[1h]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB process restarted"
+          description: "CouchDB process has restarted recently."
+
+      - alert: CouchDBCriticalLogs
+        expr: increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "CouchDB critical log entries"
+          description: "Critical or error log entries detected in the last 5 minutes."