# Mirror of https://github.com/samber/awesome-prometheus-alerts.git
# Synced 2026-06-23 01:47:17 +08:00
# Mirrored file: 168 lines, 7.2 KiB, YAML
# Prometheus alerting rules for CouchDB.
# Metric names follow the couchdb_* series referenced in each expression; the
# group name suggests they come from the gesellix CouchDB Prometheus exporter
# (TODO confirm against the exporter version you deploy).
groups:
  - name: GesellixCouchdbPrometheusExporter
    rules:
      # Fires when either "up" gauge reports 0 for 2 minutes.
      - alert: CouchdbNodeDown
        expr: 'couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: CouchDB node down (instance {{ $labels.instance }})
          description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Compares used atom memory against 90% of the reported atom memory total.
      - alert: CouchdbAtomMemoryUsageCritical
        expr: 'couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
          description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # NOTE(review): 1000 is a hard-coded capacity, not read from the node.
      # Presumably it should mirror the node's open-database limit; adjust it
      # to your configuration (TODO confirm the configured value).
      - alert: CouchdbOpenDatabasesCritical
        expr: 'couchdb_httpd_open_databases > 0.9 * 1000'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: CouchDB open databases critical (instance {{ $labels.instance }})
          description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # NOTE(review): 65535 is a hard-coded file-descriptor limit; adjust it to
      # the limit actually applied to the CouchDB process (TODO confirm).
      - alert: CouchdbOpenOsFilesCritical
        expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: CouchDB open OS files critical (instance {{ $labels.instance }})
          description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Overall 5xx ratio. The `code` label is aggregated away on both sides so
      # the division matches series one-to-one: the numerator is selected by
      # `code`, and a direct division against a series without that label
      # would match nothing, so the alert would never fire.
      - alert: Couchdb5xxErrorRatioHigh
        expr: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / sum without (code) (rate(couchdb_httpd_requests[5m])) > 0.05'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: CouchDB 5xx error ratio high (instance {{ $labels.instance }})
          description: "More than 5% of HTTP requests are returning 5xx errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbTemporaryViewReadRateCritical
        expr: 'rate(couchdb_httpd_temporary_view_reads[5m]) > 100'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: CouchDB temporary view read rate critical (instance {{ $labels.instance }})
          description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbMangoQueriesScanningTooManyDocs
        expr: 'rate(couchdb_mango_too_many_docs_scanned[5m]) > 50'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CouchDB Mango queries scanning too many docs (instance {{ $labels.instance }})
          description: "Some Mango queries are scanning too many documents, consider adding indexes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbMangoQueriesFailedDueToInvalidIndex
        expr: 'rate(couchdb_mango_query_invalid_index[5m]) > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CouchDB Mango queries failed due to invalid index (instance {{ $labels.instance }})
          description: "Some Mango queries failed to execute because the index was missing or invalid\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbMangoDocsExaminedHigh
        expr: 'rate(couchdb_mango_docs_examined[5m]) > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CouchDB Mango docs examined high (instance {{ $labels.instance }})
          description: "High number of documents examined per Mango queries, consider indexing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The four replicator rules below fire on any increase of the respective
      # counter within the last 5 minutes, sustained for 1 minute.
      - alert: CouchdbReplicatorManagerDied
        expr: 'increase(couchdb_replicator_changes_manager_deaths[5m]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB Replicator manager died (instance {{ $labels.instance }})
          description: "Replication manager process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbReplicatorQueueProcessDied
        expr: 'increase(couchdb_replicator_changes_queue_deaths[5m]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB Replicator queue process died (instance {{ $labels.instance }})
          description: "Replication queue process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbReplicatorReaderProcessDied
        expr: 'increase(couchdb_replicator_changes_reader_deaths[5m]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB Replicator reader process died (instance {{ $labels.instance }})
          description: "Replication reader process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbReplicatorFailedToStart
        expr: 'increase(couchdb_replicator_failed_starts[5m]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB Replicator failed to start (instance {{ $labels.instance }})
          description: "One or more replication tasks failed to start\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbReplicationClusterUnstable
        expr: 'couchdb_replicator_cluster_is_stable == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: CouchDB replication cluster unstable (instance {{ $labels.instance }})
          description: "The replication cluster is unstable, replication may be interrupted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: CouchdbReplicationReadFailures
        expr: 'increase(couchdb_replicator_changes_read_failures[5m]) > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CouchDB replication read failures (instance {{ $labels.instance }})
          description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # NOTE(review): the process_* selectors in this rule and the next carry
      # no job/instance matcher, so they evaluate against every scraped target
      # that exposes these series, not only CouchDB. Consider adding a matcher
      # for your CouchDB scrape job.
      - alert: CouchdbFileDescriptorsHigh
        expr: 'process_open_fds / process_max_fds > 0.85'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CouchDB file descriptors high (instance {{ $labels.instance }})
          description: "Process is using more than 85% of allowed file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the process start timestamp changed within the last hour.
      - alert: CouchdbProcessRestarted
        expr: 'changes(process_start_time_seconds[1h]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB process restarted (instance {{ $labels.instance }})
          description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Matches log counters whose `level` label is "error" or "critical".
      - alert: CouchdbCriticalLogEntries
        expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: CouchDB critical log entries (instance {{ $labels.instance }})
          description: "Critical or error log entries detected in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"