diff --git a/_data/rules.yml b/_data/rules.yml
index 649f746..b1d1dda 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1445,6 +1445,135 @@ groups:
                 severity: critical
                 for: 2m
 
+      # CouchDB alerts; the queries target the metrics of the
+      # gesellix/couchdb-prometheus-exporter listed below.
+      - name: CouchDB
+        exporters:
+          - name: gesellix/couchdb-prometheus-exporter
+            slug: gesellix-couchdb-prometheus-exporter
+            doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
+            rules:
+              - name: CouchDB node down
+                description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
+                query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
+                severity: critical
+                for: 2m
+
+              - name: CouchDB atom memory usage critical
+                description: Atom memory usage is above 90% of limit
+                query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
+                severity: critical
+                for: 5m
+
+              # NOTE(review): 1000 is a hard-coded capacity; confirm it
+              # matches the node's configured open-database limit.
+              - name: CouchDB open databases critical
+                description: Number of open databases exceeds 90% of node capacity
+                query: "couchdb_httpd_open_databases > 0.9 * 1000"
+                severity: critical
+                for: 5m
+
+              # NOTE(review): 65535 is a hard-coded descriptor limit; confirm
+              # it matches the limit the CouchDB process actually runs with.
+              - name: CouchDB open OS files critical
+                description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
+                query: "couchdb_httpd_open_os_files > 0.9 * 65535"
+                severity: critical
+                for: 5m
+
+              # The status-code series are selected by their `code` label.
+              # Summing that label away lets the result match the request-rate
+              # series (assumed to carry no `code` label - TODO confirm).
+              - name: CouchDB 5xx error ratio high
+                description: More than 5% of HTTP requests are returning 5xx errors
+                query: "sum without (code) (rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05"
+                severity: critical
+                for: 5m
+
+              - name: CouchDB temporary view read rate critical
+                description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
+                query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
+                severity: critical
+                for: 5m
+
+              - name: CouchDB Mango queries scanning too many docs
+                description: Some Mango queries are scanning too many documents, consider adding indexes
+                query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
+                severity: warning
+                for: 5m
+
+              - name: CouchDB Mango queries failed due to invalid index
+                description: Some Mango queries failed to execute because the index was missing or invalid
+                query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
+                severity: warning
+                for: 5m
+
+              - name: CouchDB Mango docs examined high
+                description: High number of documents examined by Mango queries, consider indexing
+                query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
+                severity: warning
+                for: 5m
+
+              - name: CouchDB Replicator manager died
+                description: Replication manager process has crashed
+                query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+
+              - name: CouchDB Replicator queue process died
+                description: Replication queue process has crashed
+                query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+
+              - name: CouchDB Replicator reader process died
+                description: Replication reader process has crashed
+                query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
+                severity: critical
+                for: 1m
+
+              - name: CouchDB Replicator failed to start
+                description: One or more replication tasks failed to start
+                query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
+                severity: critical
+                for: 1m
+
+              - name: CouchDB replication cluster unstable
+                description: The replication cluster is unstable, replication may be interrupted
+                query: "couchdb_replicator_cluster_is_stable == 0"
+                severity: critical
+                for: 2m
+
+              - name: CouchDB replication read failures
+                description: Replication changes feed has failed reads more than 5 times in 5 minutes
+                query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
+                severity: warning
+                for: 5m
+
+              # process_* metric names are not CouchDB-specific, so both sides
+              # are scoped by job; adapt the matcher to your scrape config.
+              # NOTE(review): these presumably describe the scraped exporter
+              # process rather than CouchDB itself - confirm.
+              - name: CouchDB file descriptors high
+                description: Process is using more than 85% of allowed file descriptors
+                query: "process_open_fds{job=~\".*couchdb.*\"} / process_max_fds{job=~\".*couchdb.*\"} > 0.85"
+                severity: warning
+                for: 5m
+
+              # Scoped by job so restarts of unrelated targets do not raise a
+              # CouchDB alert; adapt the matcher to your scrape config.
+              - name: CouchDB process restarted
+                description: CouchDB process has restarted recently
+                query: "changes(process_start_time_seconds{job=~\".*couchdb.*\"}[1h]) > 0"
+                severity: critical
+                for: 1m
+
+              - name: CouchDB critical log entries
+                description: Critical or error log entries detected in the last 5 minutes
+                query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
+                severity: critical
+                for: 1m
+
       - name: Zookeeper
         exporters:
           - name: cloudflare/kafka_zookeeper_exporter