This commit is contained in:
samber 2026-03-16 00:27:40 +00:00
parent c37ef8f50c
commit e2af1325c6
22 changed files with 109 additions and 176 deletions

View file

@ -15,7 +15,7 @@ groups:
description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionTaskPending
expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
for: 2m
labels:
severity: warning
@ -24,7 +24,7 @@ groups:
description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraViewwriteLatency
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
for: 2m
labels:
severity: warning
@ -32,18 +32,19 @@ groups:
summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraBadHacker
- alert: CassandraAuthenticationFailures
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra bad hacker (instance {{ $labels.instance }})
summary: Cassandra authentication failures (instance {{ $labels.instance }})
description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: CassandraNodeDown
expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -141,7 +142,7 @@ groups:
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
for: 0m
labels:
severity: critical
@ -150,7 +151,7 @@ groups:
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
for: 0m
labels:
severity: critical

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: CassandraNodeIsUnavailable
expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
for: 0m
expr: 'cassandra_endpoint_active < 1'
for: 1m
labels:
severity: critical
annotations:
@ -51,7 +52,7 @@ groups:
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal
expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
for: 2m
labels:
severity: critical
@ -102,7 +103,7 @@ groups:
severity: critical
annotations:
summary: Cassandra client request write failure (instance {{ $labels.instance }})
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'

View file

@ -5,6 +5,7 @@ groups:
rules:
# Adjust the job label to match your Prometheus configuration.
- alert: ClickhouseNodeDown
expr: 'up{job="clickhouse"} == 0'
for: 2m
@ -15,7 +16,7 @@ groups:
description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseMemoryUsageCritical
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
for: 5m
labels:
severity: critical
@ -24,7 +25,7 @@ groups:
description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseMemoryUsageWarning
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
for: 5m
labels:
severity: warning
@ -86,16 +87,6 @@ groups:
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkTraffic
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
for: 5m
labels:
severity: warning
annotations:
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighTcpConnections
expr: 'ClickHouseMetrics_TCPConnection > 400'
@ -106,17 +97,18 @@ groups:
summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Adjust the threshold based on your cluster size and expected replication traffic.
- alert: ClickhouseInterserverConnectionIssues
expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
for: 1m
expr: 'ClickHouseMetrics_InterserverConnection > 50'
for: 5m
labels:
severity: warning
annotations:
summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseZookeeperConnectionIssues
expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
expr: 'ClickHouseMetrics_ZooKeeperSession != 1'
for: 3m
labels:
severity: warning

View file

@ -42,7 +42,7 @@ groups:
description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Couchdb5xxErrorRatioHigh
expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05'
expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0'
for: 5m
labels:
severity: critical
@ -153,7 +153,7 @@ groups:
expr: 'changes(process_start_time_seconds[1h]) > 0'
for: 1m
labels:
severity: critical
severity: info
annotations:
summary: CouchDB process restarted (instance {{ $labels.instance }})
description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -59,18 +59,20 @@ groups:
summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: ElasticsearchHealthyNodes
expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: ElasticsearchHealthyDataNodes
expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -115,7 +117,7 @@ groups:
- alert: ElasticsearchUnassignedShards
expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
for: 0m
for: 2m
labels:
severity: critical
annotations:
@ -141,7 +143,7 @@ groups:
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingLatency
expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005'
expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0'
for: 10m
labels:
severity: warning
@ -168,7 +170,7 @@ groups:
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryLatency
expr: 'increase(elasticsearch_indices_search_fetch_time_seconds[1m]) / increase(elasticsearch_indices_search_fetch_total[1m]) > 1'
expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0'
for: 5m
labels:
severity: warning

View file

@ -42,7 +42,7 @@ groups:
description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopMapReduceTaskFailures
expr: 'hadoop_mapreduce_task_failures_total > 100'
expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
for: 10m
labels:
severity: critical
@ -60,7 +60,7 @@ groups:
description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopYarnContainerAllocationFailures
expr: 'hadoop_yarn_container_allocation_failures_total > 10'
expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
for: 10m
labels:
severity: warning
@ -78,10 +78,10 @@ groups:
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopHbaseRegionServerHeapLow
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
for: 10m
labels:
severity: critical
severity: warning
annotations:
summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -14,11 +14,11 @@ groups:
summary: Kafka topics replicas (instance {{ $labels.instance }})
description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KafkaConsumersGroup
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
- alert: KafkaConsumerGroupLag
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
for: 1m
labels:
severity: critical
severity: warning
annotations:
summary: Kafka consumers group (instance {{ $labels.instance }})
description: "Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Kafka consumer group lag (instance {{ $labels.instance }})
description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -12,7 +12,7 @@ groups:
severity: warning
annotations:
summary: Meilisearch index is empty (instance {{ $labels.instance }})
description: "Meilisearch instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MeilisearchHttpResponseTime
expr: 'meilisearch_http_response_time_seconds > 0.5'

View file

@ -78,19 +78,10 @@ groups:
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections
expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80'
expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbVirtualMemoryUsage
expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,18 +5,20 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: MongodbDown
expr: 'mongodb_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: MongoDB Down (instance {{ $labels.instance }})
description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MongodbReplicaMemberUnhealthy
expr: 'mongodb_rs_members_health == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -32,6 +34,7 @@ groups:
summary: MongoDB replication lag (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
- alert: MongodbReplicationHeadroom
expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
for: 0m
@ -60,7 +63,7 @@ groups:
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections
expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80'
for: 2m
labels:
severity: warning

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: MysqlDown
expr: 'mysql_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -41,18 +42,20 @@ groups:
summary: MySQL high threads running (instance {{ $labels.instance }})
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MysqlSlaveIoThreadNotRunning
expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MysqlSlaveSqlThreadNotRunning
expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:

View file

@ -5,24 +5,6 @@ groups:
rules:
- alert: NatsHighConnectionCount
expr: 'gnatsd_varz_connections > 100'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high connection count (instance {{ $labels.instance }})
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighSubscriptionsCount
expr: 'gnatsd_connz_subscriptions > 50'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high subscriptions count (instance {{ $labels.instance }})
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighRoutesCount
expr: 'gnatsd_varz_routes > 10'
for: 3m
@ -59,8 +41,9 @@ groups:
summary: Nats server down (instance {{ $labels.instance }})
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
- alert: NatsHighCpuUsage
expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
expr: 'gnatsd_varz_cpu > 80'
for: 5m
labels:
severity: warning
@ -78,7 +61,7 @@ groups:
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamStoreUsage
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
for: 5m
labels:
severity: warning
@ -87,7 +70,7 @@ groups:
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamMemoryUsage
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
for: 5m
labels:
severity: warning
@ -122,56 +105,20 @@ groups:
summary: Nats too many errors (instance {{ $labels.instance }})
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsJetstreamConsumersExceeded
- alert: NatsJetstreamAccountsExceeded
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsFrequentAuthenticationTimeouts
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsMaxPayloadSizeExceeded
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
for: 5m
labels:
severity: critical
annotations:
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsLeafNodeConnectionIssue
expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
for: 5m
labels:
severity: critical
annotations:
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsMaxPingOperationsExceeded
expr: 'gnatsd_varz_ping_max > 50'
expr: 'gnatsd_varz_leafnodes == 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsWriteDeadlineExceeded
expr: 'gnatsd_varz_write_deadline > 10'
for: 5m
labels:
severity: critical
annotations:
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: PatroniHasNoLeader
expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
for: 0m
expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
for: 1m
labels:
severity: critical
annotations:

View file

@ -21,10 +21,10 @@ groups:
severity: warning
annotations:
summary: PGBouncer errors (instance {{ $labels.instance }})
description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PgbouncerMaxConnections
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0'
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
for: 0m
labels:
severity: critical

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: PostgresqlDown
expr: 'pg_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -104,24 +105,6 @@ groups:
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlUnusedReplicationSlot
expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)'
for: 1m
@ -150,7 +133,7 @@ groups:
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlSslCompressionActive
expr: 'sum(pg_stat_ssl_compression) > 0'
expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
for: 0m
labels:
severity: warning
@ -159,7 +142,7 @@ groups:
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical

View file

@ -24,7 +24,7 @@ groups:
description: "The number of subscription backlog entries is over 100k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PulsarTopicLargeBacklogStorageSize
expr: 'sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)'
expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024'
for: 1h
labels:
severity: warning
@ -33,7 +33,7 @@ groups:
description: "The topic backlog storage size is over 5 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PulsarTopicVeryLargeBacklogStorageSize
expr: 'sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)'
expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024'
for: 1h
labels:
severity: critical
@ -78,7 +78,7 @@ groups:
description: "Observing Readonly Bookies\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PulsarHighNumberOfFunctionErrors
expr: 'sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)'
expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10'
for: 1m
labels:
severity: critical
@ -87,7 +87,7 @@ groups:
description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PulsarHighNumberOfSinkErrors
expr: 'sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)'
expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
for: 1m
labels:
severity: critical

View file

@ -5,18 +5,20 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: RabbitmqDown
expr: 'rabbitmq_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: RabbitMQ down (instance {{ $labels.instance }})
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: RabbitmqClusterDown
expr: 'sum(rabbitmq_running) < 3'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -33,13 +35,13 @@ groups:
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqOutOfMemory
expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
for: 2m
labels:
severity: warning
annotations:
summary: RabbitMQ out of memory (instance {{ $labels.instance }})
description: "Memory available for RabbmitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
expr: 'rabbitmq_connectionsTotal > 1000'
@ -80,9 +82,10 @@ groups:
summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 5m delay allows a short service restart without triggering an alert.
- alert: RabbitmqNoConsumer
expr: 'rabbitmq_queue_consumers == 0'
for: 1m
for: 5m
labels:
severity: critical
annotations:
@ -100,11 +103,11 @@ groups:
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the exchange name in the dedicated `exchange` label.
- alert: RabbitmqUnactiveExchange
- alert: RabbitmqInactiveExchange
expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
for: 2m
labels:
severity: warning
annotations:
summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
summary: RabbitMQ inactive exchange (instance {{ $labels.instance }})
description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,18 +5,20 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: RabbitmqNodeDown
expr: 'sum(rabbitmq_build_info) < 3'
for: 0m
for: 1m
labels:
severity: critical
annotations:
summary: RabbitMQ node down (instance {{ $labels.instance }})
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: RabbitmqNodeNotDistributed
expr: 'erlang_vm_dist_node_state < 3'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -33,7 +35,7 @@ groups:
description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqMemoryHigh
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0'
for: 2m
labels:
severity: warning
@ -42,7 +44,7 @@ groups:
description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqFileDescriptorsUsage
expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0'
for: 2m
labels:
severity: warning
@ -57,7 +59,7 @@ groups:
severity: warning
annotations:
summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyUnackMessages
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: RedisDown
expr: 'redis_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -23,9 +24,10 @@ groups:
summary: Redis missing master (instance {{ $labels.instance }})
description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: RedisTooManyMasters
expr: 'count(redis_instance_info{role="master"}) > 1'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -60,13 +62,13 @@ groups:
description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RedisMissingBackup
expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24'
expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48'
for: 0m
labels:
severity: critical
annotations:
summary: Redis missing backup (instance {{ $labels.instance }})
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Redis has not been backed up for 48 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- alert: RedisOutOfSystemMemory
@ -106,10 +108,10 @@ groups:
description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RedisRejectedConnections
expr: 'increase(redis_rejected_connections_total[1m]) > 0'
expr: 'increase(redis_rejected_connections_total[1m]) > 5'
for: 0m
labels:
severity: critical
severity: warning
annotations:
summary: Redis rejected connections (instance {{ $labels.instance }})
description: "Some connections to Redis has been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -30,7 +30,7 @@ groups:
severity: critical
annotations:
summary: Solr replication errors (instance {{ $labels.instance }})
description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SolrLowLiveNodeCount
expr: 'solr_collections_live_nodes < 2'

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: SqlServerDown
expr: 'mssql_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations:
@ -15,10 +16,10 @@ groups:
description: "SQL server instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SqlServerDeadlock
expr: 'increase(mssql_deadlocks[1m]) > 5'
for: 0m
expr: 'mssql_deadlocks > 5'
for: 1m
labels:
severity: warning
annotations:
summary: SQL Server deadlock (instance {{ $labels.instance }})
description: "SQL Server is having some deadlock.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,9 +5,10 @@ groups:
rules:
# 1m delay allows a restart without triggering an alert.
- alert: ZookeeperDown
expr: 'zk_up == 0'
for: 0m
for: 1m
labels:
severity: critical
annotations: