diff --git a/dist/rules/cassandra/criteo-cassandra-exporter.yml b/dist/rules/cassandra/criteo-cassandra-exporter.yml index 7ff24bf..82ab06c 100644 --- a/dist/rules/cassandra/criteo-cassandra-exporter.yml +++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml @@ -15,7 +15,7 @@ groups: description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraCompactionTaskPending - expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100' for: 2m labels: severity: warning @@ -24,7 +24,7 @@ groups: description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraViewwriteLatency - expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000' for: 2m labels: severity: warning @@ -32,18 +32,19 @@ groups: summary: Cassandra viewwrite latency (instance {{ $labels.instance }}) description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: CassandraBadHacker + - alert: CassandraAuthenticationFailures expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5' for: 2m labels: severity: warning annotations: - summary: Cassandra bad hacker (instance {{ $labels.instance }}) + summary: Cassandra authentication failures (instance {{ $labels.instance }}) description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: CassandraNodeDown expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -141,7 +142,7 @@ groups: description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestWriteFailure - expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0' for: 0m labels: severity: critical @@ -150,7 +151,7 @@ groups: description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure - expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0' + expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0' for: 0m labels: severity: critical diff --git a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml index 2d397f5..7369835 100644 --- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml +++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: CassandraNodeIsUnavailable - expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1' - for: 0m + expr: 'cassandra_endpoint_active < 1' + for: 1m labels: severity: critical annotations: @@ -51,7 +52,7 @@ groups: description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraConnectionTimeoutsTotal - expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5' + expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5' for: 2m labels: severity: critical @@ -102,7 +103,7 @@ groups: severity: critical annotations: summary: Cassandra client request write failure (instance {{ $labels.instance }}) - description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CassandraClientRequestReadFailure expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0' diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index 5553722..f1e983d 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -5,6 +5,7 @@ groups: rules: + # Adjust the job label to match your Prometheus configuration. - alert: ClickhouseNodeDown expr: 'up{job="clickhouse"} == 0' for: 2m @@ -15,7 +16,7 @@ groups: description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseMemoryUsageCritical - expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90' + expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0' for: 5m labels: severity: critical @@ -24,7 +25,7 @@ groups: description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseMemoryUsageWarning - expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80' + expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0' for: 5m labels: severity: warning @@ -86,16 +87,6 @@ groups: summary: ClickHouse No Live Replicas (instance {{ $labels.instance }}) description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Please replace the threshold with an appropriate value - - alert: ClickhouseHighNetworkTraffic - expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250' - for: 5m - labels: - severity: warning - annotations: - summary: ClickHouse High Network Traffic (instance {{ $labels.instance }}) - description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Please replace the threshold with an appropriate value - alert: ClickhouseHighTcpConnections expr: 'ClickHouseMetrics_TCPConnection > 400' @@ -106,17 +97,18 @@ groups: summary: ClickHouse High TCP Connections (instance {{ $labels.instance }}) description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Adjust the threshold based on your cluster size and expected replication traffic. - alert: ClickhouseInterserverConnectionIssues - expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0' - for: 1m + expr: 'ClickHouseMetrics_InterserverConnection > 50' + for: 5m labels: severity: warning annotations: summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }}) - description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ClickhouseZookeeperConnectionIssues - expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1' + expr: 'ClickHouseMetrics_ZooKeeperSession != 1' for: 3m labels: severity: warning diff --git a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml index 0280f18..5f434c5 100644 --- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml +++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml @@ -42,7 +42,7 @@ groups: description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: Couchdb5xxErrorRatioHigh - expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05' + expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0' for: 5m labels: severity: critical @@ -153,7 +153,7 @@ groups: expr: 'changes(process_start_time_seconds[1h]) > 0' for: 1m labels: - severity: critical + severity: info annotations: summary: CouchDB process restarted (instance {{ $labels.instance }}) description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index 1980c1d..cf4386a 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -59,18 +59,20 @@ groups: summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }}) description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: ElasticsearchHealthyNodes expr: 'elasticsearch_cluster_health_number_of_nodes < 3' - for: 0m + for: 1m labels: severity: critical annotations: summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }}) description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: ElasticsearchHealthyDataNodes expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3' - for: 0m + for: 1m labels: severity: critical annotations: @@ -115,7 +117,7 @@ groups: - alert: ElasticsearchUnassignedShards expr: 'elasticsearch_cluster_health_unassigned_shards > 0' - for: 0m + for: 2m labels: severity: critical annotations: @@ -141,7 +143,7 @@ groups: description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighIndexingLatency - expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005' + expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0' for: 10m labels: severity: warning @@ -168,7 +170,7 @@ groups: description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: ElasticsearchHighQueryLatency - expr: 'increase(elasticsearch_indices_search_fetch_time_seconds[1m]) / increase(elasticsearch_indices_search_fetch_total[1m]) > 1' + expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0' for: 5m labels: severity: warning diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml index 5a94f8a..d9f3b8e 100644 --- a/dist/rules/hadoop/jmx_exporter.yml +++ b/dist/rules/hadoop/jmx_exporter.yml @@ -42,7 +42,7 @@ groups: description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopMapReduceTaskFailures - expr: 'hadoop_mapreduce_task_failures_total > 100' + expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100' for: 10m labels: severity: critical @@ -60,7 +60,7 @@ groups: description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopYarnContainerAllocationFailures - expr: 'hadoop_yarn_container_allocation_failures_total > 10' + expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10' for: 10m labels: severity: warning @@ -78,10 +78,10 @@ groups: description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HadoopHbaseRegionServerHeapLow - expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2' + expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8' for: 10m labels: - severity: critical + severity: warning annotations: summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }}) description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/kafka/danielqsj-kafka-exporter.yml b/dist/rules/kafka/danielqsj-kafka-exporter.yml index 5348361..69f4cd0 100644 --- a/dist/rules/kafka/danielqsj-kafka-exporter.yml +++ b/dist/rules/kafka/danielqsj-kafka-exporter.yml @@ -14,11 +14,11 @@ groups: summary: Kafka topics replicas (instance {{ $labels.instance }}) description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KafkaConsumersGroup - expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50' + - alert: KafkaConsumerGroupLag + expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000' for: 1m labels: - severity: critical + severity: warning annotations: - summary: Kafka consumers group (instance {{ $labels.instance }}) - description: "Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kafka consumer group lag (instance {{ $labels.instance }}) + description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml index 9e31806..61929c5 100644 --- a/dist/rules/meilisearch/embedded-exporter.yml +++ b/dist/rules/meilisearch/embedded-exporter.yml @@ -12,7 +12,7 @@ groups: severity: warning annotations: summary: Meilisearch index is empty (instance {{ $labels.instance }}) - description: "Meilisearch instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MeilisearchHttpResponseTime expr: 'meilisearch_http_response_time_seconds > 0.5' diff --git a/dist/rules/mongodb/dcu-mongodb-exporter.yml b/dist/rules/mongodb/dcu-mongodb-exporter.yml index 9ef62a0..422e224 100644 --- a/dist/rules/mongodb/dcu-mongodb-exporter.yml +++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml @@ -78,19 +78,10 @@ groups: description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbTooManyConnections - expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80' + expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80' for: 2m labels: severity: warning annotations: summary: MongoDB too many connections (instance {{ $labels.instance }}) description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: MongodbVirtualMemoryUsage - expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3' - for: 2m - labels: - severity: warning - annotations: - summary: MongoDB virtual memory usage (instance {{ $labels.instance }}) - description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml index 5dc503a..25d7642 100644 --- a/dist/rules/mongodb/percona-mongodb-exporter.yml +++ b/dist/rules/mongodb/percona-mongodb-exporter.yml @@ -5,18 +5,20 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: MongodbDown expr: 'mongodb_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: MongoDB Down (instance {{ $labels.instance }}) description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: MongodbReplicaMemberUnhealthy expr: 'mongodb_rs_members_health == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -32,6 +34,7 @@ groups: summary: MongoDB replication lag (instance {{ $labels.instance }}) description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both. - alert: MongodbReplicationHeadroom expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' for: 0m @@ -60,7 +63,7 @@ groups: description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: MongodbTooManyConnections - expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80' + expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80' for: 2m labels: severity: warning diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml index a19fc17..b581754 100644 --- a/dist/rules/mysql/mysqld-exporter.yml +++ b/dist/rules/mysql/mysqld-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: MysqlDown expr: 'mysql_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -41,18 +42,20 @@ groups: summary: MySQL high threads running (instance {{ $labels.instance }}) description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: MysqlSlaveIoThreadNotRunning expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: MySQL Slave IO thread not running (instance {{ $labels.instance }}) description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: MysqlSlaveSqlThreadNotRunning expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0' - for: 0m + for: 1m labels: severity: critical annotations: diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index e9ed81e..ee29e1c 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -5,24 +5,6 @@ groups: rules: - - alert: NatsHighConnectionCount - expr: 'gnatsd_varz_connections > 100' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high connection count (instance {{ $labels.instance }}) - description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsHighSubscriptionsCount - expr: 'gnatsd_connz_subscriptions > 50' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high subscriptions count (instance {{ $labels.instance }}) - description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: NatsHighRoutesCount expr: 'gnatsd_varz_routes > 10' for: 3m @@ -59,8 +41,9 @@ groups: summary: Nats server down (instance {{ $labels.instance }}) description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale). - alert: NatsHighCpuUsage - expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8' + expr: 'gnatsd_varz_cpu > 80' for: 5m labels: severity: warning @@ -78,7 +61,7 @@ groups: description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsHighJetstreamStoreUsage - expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8' + expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0' for: 5m labels: severity: warning @@ -87,7 +70,7 @@ groups: description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsHighJetstreamMemoryUsage - expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8' + expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0' for: 5m labels: severity: warning @@ -122,56 +105,20 @@ groups: summary: Nats too many errors (instance {{ $labels.instance }}) description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: NatsJetstreamConsumersExceeded + - alert: NatsJetstreamAccountsExceeded expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100' for: 5m labels: severity: warning annotations: - summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }}) - description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsFrequentAuthenticationTimeouts - expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5' - for: 5m - labels: - severity: warning - annotations: - summary: Nats frequent authentication timeouts (instance {{ $labels.instance }}) - description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsMaxPayloadSizeExceeded - expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024' - for: 5m - labels: - severity: critical - annotations: - summary: Nats max payload size exceeded (instance {{ $labels.instance }}) - description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }}) + description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsLeafNodeConnectionIssue - expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0' - for: 5m - labels: - severity: critical - annotations: - summary: Nats leaf node connection issue (instance {{ $labels.instance }}) - description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsMaxPingOperationsExceeded - expr: 'gnatsd_varz_ping_max > 50' + expr: 'gnatsd_varz_leafnodes == 0' for: 5m labels: severity: warning annotations: - summary: Nats max ping operations exceeded (instance {{ $labels.instance }}) - description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsWriteDeadlineExceeded - expr: 'gnatsd_varz_write_deadline > 10' - for: 5m - labels: - severity: critical - annotations: - summary: Nats write deadline exceeded (instance {{ $labels.instance }}) - description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Nats leaf node connection issue (instance {{ $labels.instance }}) + description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/patroni/embedded-exporter-patroni.yml b/dist/rules/patroni/embedded-exporter-patroni.yml index 87528f3..8055230 100644 --- a/dist/rules/patroni/embedded-exporter-patroni.yml +++ b/dist/rules/patroni/embedded-exporter-patroni.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: PatroniHasNoLeader - expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)' - for: 0m + expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)' + for: 1m labels: severity: critical annotations: diff --git a/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml b/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml index ec83f51..1171fae 100644 --- a/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml +++ b/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml @@ -21,10 +21,10 @@ groups: severity: warning annotations: summary: PGBouncer errors (instance {{ $labels.instance }}) - description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PgbouncerMaxConnections - expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0' + expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0' for: 0m labels: severity: critical diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index a8b5846..b08bef7 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: PostgresqlDown expr: 'pg_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -104,24 +105,6 @@ groups: summary: Postgresql low XID consumption (instance {{ $labels.instance }}) description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PostgresqlHighRateStatementTimeout - expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) - description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PostgresqlHighRateDeadlock - expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql high rate deadlock (instance {{ $labels.instance }}) - description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PostgresqlUnusedReplicationSlot expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)' for: 1m @@ -150,7 +133,7 @@ groups: description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlSslCompressionActive - expr: 'sum(pg_stat_ssl_compression) > 0' + expr: 'sum by (instance) (pg_stat_ssl_compression) > 0' for: 0m labels: severity: warning @@ -159,7 +142,7 @@ groups: description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyLocksAcquired - expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' + expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' for: 2m labels: severity: critical diff --git a/dist/rules/pulsar/embedded-exporter.yml b/dist/rules/pulsar/embedded-exporter.yml index c6ba4ae..8885369 100644 --- a/dist/rules/pulsar/embedded-exporter.yml +++ b/dist/rules/pulsar/embedded-exporter.yml @@ -24,7 +24,7 @@ groups: description: "The number of subscription backlog entries is over 100k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarTopicLargeBacklogStorageSize - expr: 'sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)' + expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024' for: 1h labels: severity: warning @@ -33,7 +33,7 @@ groups: description: "The topic backlog storage size is over 5 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarTopicVeryLargeBacklogStorageSize - expr: 'sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)' + expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024' for: 1h labels: severity: critical @@ -78,7 +78,7 @@ groups: description: "Observing Readonly Bookies\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarHighNumberOfFunctionErrors - expr: 'sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)' + expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10' for: 1m labels: severity: critical @@ -87,7 +87,7 @@ groups: description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PulsarHighNumberOfSinkErrors - expr: 'sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)' + expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10' for: 1m labels: severity: critical diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml index 85a19e4..fd83621 100644 --- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml @@ -5,18 +5,20 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: RabbitmqDown expr: 'rabbitmq_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: summary: RabbitMQ down (instance {{ $labels.instance }}) description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: RabbitmqClusterDown expr: 'sum(rabbitmq_running) < 3' - for: 0m + for: 1m labels: severity: critical annotations: @@ -33,13 +35,13 @@ groups: description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqOutOfMemory - expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90' + expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0' for: 2m labels: severity: warning annotations: summary: RabbitMQ out of memory (instance {{ $labels.instance }}) - description: "Memory available for RabbmitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyConnections expr: 'rabbitmq_connectionsTotal > 1000' @@ -80,9 +82,10 @@ groups: summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }}) description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Allows a short service restart. - alert: RabbitmqNoConsumer expr: 'rabbitmq_queue_consumers == 0' - for: 1m + for: 5m labels: severity: critical annotations: @@ -100,11 +103,11 @@ groups: description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # Indicate the exchange name in dedicated label. - - alert: RabbitmqUnactiveExchange + - alert: RabbitmqInactiveExchange expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' for: 2m labels: severity: warning annotations: - summary: RabbitMQ unactive exchange (instance {{ $labels.instance }}) + summary: RabbitMQ inactive exchange (instance {{ $labels.instance }}) description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index 46a23ab..5ddcc95 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -5,18 +5,20 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: RabbitmqNodeDown expr: 'sum(rabbitmq_build_info) < 3' - for: 0m + for: 1m labels: severity: critical annotations: summary: RabbitMQ node down (instance {{ $labels.instance }}) description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: RabbitmqNodeNotDistributed expr: 'erlang_vm_dist_node_state < 3' - for: 0m + for: 1m labels: severity: critical annotations: @@ -33,7 +35,7 @@ groups: description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqMemoryHigh - expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90' + expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0' for: 2m labels: severity: warning @@ -42,7 +44,7 @@ groups: description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqFileDescriptorsUsage - expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' + expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0' for: 2m labels: severity: warning @@ -57,7 +59,7 @@ groups: severity: warning annotations: summary: RabbitMQ too many ready messages (instance {{ $labels.instance }}) - description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyUnackMessages expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml index 7ca53b6..db96953 100644 --- a/dist/rules/redis/oliver006-redis-exporter.yml +++ b/dist/rules/redis/oliver006-redis-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: RedisDown expr: 'redis_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -23,9 +24,10 @@ groups: summary: Redis missing master (instance {{ $labels.instance }}) description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 1m delay allows a restart without triggering an alert. - alert: RedisTooManyMasters expr: 'count(redis_instance_info{role="master"}) > 1' - for: 0m + for: 1m labels: severity: critical annotations: @@ -60,13 +62,13 @@ groups: description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisMissingBackup - expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24' + expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48' for: 0m labels: severity: critical annotations: summary: Redis missing backup (instance {{ $labels.instance }}) - description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Redis has not been backed up for 48 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. - alert: RedisOutOfSystemMemory @@ -106,10 +108,10 @@ groups: description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RedisRejectedConnections - expr: 'increase(redis_rejected_connections_total[1m]) > 0' + expr: 'increase(redis_rejected_connections_total[1m]) > 5' for: 0m labels: - severity: critical + severity: warning annotations: summary: Redis rejected connections (instance {{ $labels.instance }}) description: "Some connections to Redis has been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/solr/embedded-exporter.yml b/dist/rules/solr/embedded-exporter.yml index 5b653fb..c8f662b 100644 --- a/dist/rules/solr/embedded-exporter.yml +++ b/dist/rules/solr/embedded-exporter.yml @@ -30,7 +30,7 @@ groups: severity: critical annotations: summary: Solr replication errors (instance {{ $labels.instance }}) - description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SolrLowLiveNodeCount expr: 'solr_collections_live_nodes < 2' diff --git a/dist/rules/sql-server/ozarklake-mssql-exporter.yml b/dist/rules/sql-server/ozarklake-mssql-exporter.yml index a699402..51ffd94 100644 --- a/dist/rules/sql-server/ozarklake-mssql-exporter.yml +++ b/dist/rules/sql-server/ozarklake-mssql-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: SqlServerDown expr: 'mssql_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: @@ -15,10 +16,10 @@ groups: description: "SQL server instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SqlServerDeadlock - expr: 'increase(mssql_deadlocks[1m]) > 5' - for: 0m + expr: 'mssql_deadlocks > 5' + for: 1m labels: severity: warning annotations: summary: SQL Server deadlock (instance {{ $labels.instance }}) - description: "SQL Server is having some deadlock.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml index 0e3747e..69e29e5 100644 --- a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml +++ b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml @@ -5,9 +5,10 @@ groups: rules: + # 1m delay allows a restart without triggering an alert. - alert: ZookeeperDown expr: 'zk_up == 0' - for: 0m + for: 1m labels: severity: critical annotations: