mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Publish
This commit is contained in:
parent
c37ef8f50c
commit
e2af1325c6
22 changed files with 109 additions and 176 deletions
|
|
@ -15,7 +15,7 @@ groups:
|
|||
description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCompactionTaskPending
|
||||
expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -24,7 +24,7 @@ groups:
|
|||
description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraViewwriteLatency
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -32,18 +32,19 @@ groups:
|
|||
summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
|
||||
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraBadHacker
|
||||
- alert: CassandraAuthenticationFailures
|
||||
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra bad hacker (instance {{ $labels.instance }})
|
||||
summary: Cassandra authentication failures (instance {{ $labels.instance }})
|
||||
description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: CassandraNodeDown
|
||||
expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -141,7 +142,7 @@ groups:
|
|||
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestWriteFailure
|
||||
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -150,7 +151,7 @@ groups:
|
|||
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestReadFailure
|
||||
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: CassandraNodeIsUnavailable
|
||||
expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
|
||||
for: 0m
|
||||
expr: 'cassandra_endpoint_active < 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -51,7 +52,7 @@ groups:
|
|||
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraConnectionTimeoutsTotal
|
||||
expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
|
||||
expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -102,7 +103,7 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request write failure (instance {{ $labels.instance }})
|
||||
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestReadFailure
|
||||
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
|
||||
|
|
|
|||
24
dist/rules/clickhouse/embedded-exporter.yml
vendored
24
dist/rules/clickhouse/embedded-exporter.yml
vendored
|
|
@ -5,6 +5,7 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# Adjust the job label to match your Prometheus configuration.
|
||||
- alert: ClickhouseNodeDown
|
||||
expr: 'up{job="clickhouse"} == 0'
|
||||
for: 2m
|
||||
|
|
@ -15,7 +16,7 @@ groups:
|
|||
description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseMemoryUsageCritical
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -24,7 +25,7 @@ groups:
|
|||
description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseMemoryUsageWarning
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -86,16 +87,6 @@ groups:
|
|||
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
|
||||
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighNetworkTraffic
|
||||
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
|
||||
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighTcpConnections
|
||||
expr: 'ClickHouseMetrics_TCPConnection > 400'
|
||||
|
|
@ -106,17 +97,18 @@ groups:
|
|||
summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
|
||||
description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Adjust the threshold based on your cluster size and expected replication traffic.
|
||||
- alert: ClickhouseInterserverConnectionIssues
|
||||
expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
|
||||
for: 1m
|
||||
expr: 'ClickHouseMetrics_InterserverConnection > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
|
||||
description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseZookeeperConnectionIssues
|
||||
expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
|
||||
expr: 'ClickHouseMetrics_ZooKeeperSession != 1'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ groups:
|
|||
description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: Couchdb5xxErrorRatioHigh
|
||||
expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05'
|
||||
expr: 'rate(couchdb_httpd_status_codes{code=~"5.."}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -153,7 +153,7 @@ groups:
|
|||
expr: 'changes(process_start_time_seconds[1h]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: info
|
||||
annotations:
|
||||
summary: CouchDB process restarted (instance {{ $labels.instance }})
|
||||
description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -59,18 +59,20 @@ groups:
|
|||
summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
|
||||
description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: ElasticsearchHealthyNodes
|
||||
expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
|
||||
description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: ElasticsearchHealthyDataNodes
|
||||
expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -115,7 +117,7 @@ groups:
|
|||
|
||||
- alert: ElasticsearchUnassignedShards
|
||||
expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
|
||||
for: 0m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -141,7 +143,7 @@ groups:
|
|||
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchHighIndexingLatency
|
||||
expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005'
|
||||
expr: 'increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005 and increase(elasticsearch_indices_indexing_index_total[1m]) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -168,7 +170,7 @@ groups:
|
|||
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchHighQueryLatency
|
||||
expr: 'increase(elasticsearch_indices_search_fetch_time_seconds[1m]) / increase(elasticsearch_indices_search_fetch_total[1m]) > 1'
|
||||
expr: 'increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1 and increase(elasticsearch_indices_search_query_total[1m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
8
dist/rules/hadoop/jmx_exporter.yml
vendored
8
dist/rules/hadoop/jmx_exporter.yml
vendored
|
|
@ -42,7 +42,7 @@ groups:
|
|||
description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopMapReduceTaskFailures
|
||||
expr: 'hadoop_mapreduce_task_failures_total > 100'
|
||||
expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -60,7 +60,7 @@ groups:
|
|||
description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopYarnContainerAllocationFailures
|
||||
expr: 'hadoop_yarn_container_allocation_failures_total > 10'
|
||||
expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -78,10 +78,10 @@ groups:
|
|||
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopHbaseRegionServerHeapLow
|
||||
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
|
||||
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
|
||||
description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
10
dist/rules/kafka/danielqsj-kafka-exporter.yml
vendored
10
dist/rules/kafka/danielqsj-kafka-exporter.yml
vendored
|
|
@ -14,11 +14,11 @@ groups:
|
|||
summary: Kafka topics replicas (instance {{ $labels.instance }})
|
||||
description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KafkaConsumersGroup
|
||||
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
|
||||
- alert: KafkaConsumerGroupLag
|
||||
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kafka consumers group (instance {{ $labels.instance }})
|
||||
description: "Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Kafka consumer group lag (instance {{ $labels.instance }})
|
||||
description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
2
dist/rules/meilisearch/embedded-exporter.yml
vendored
2
dist/rules/meilisearch/embedded-exporter.yml
vendored
|
|
@ -12,7 +12,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Meilisearch index is empty (instance {{ $labels.instance }})
|
||||
description: "Meilisearch instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MeilisearchHttpResponseTime
|
||||
expr: 'meilisearch_http_response_time_seconds > 0.5'
|
||||
|
|
|
|||
11
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
11
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
|
|
@ -78,19 +78,10 @@ groups:
|
|||
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbTooManyConnections
|
||||
expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80'
|
||||
expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB too many connections (instance {{ $labels.instance }})
|
||||
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbVirtualMemoryUsage
|
||||
expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
|
||||
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -5,18 +5,20 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MongodbDown
|
||||
expr: 'mongodb_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MongoDB Down (instance {{ $labels.instance }})
|
||||
description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MongodbReplicaMemberUnhealthy
|
||||
expr: 'mongodb_rs_members_health == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -32,6 +34,7 @@ groups:
|
|||
summary: MongoDB replication lag (instance {{ $labels.instance }})
|
||||
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
|
||||
- alert: MongodbReplicationHeadroom
|
||||
expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
|
||||
for: 0m
|
||||
|
|
@ -60,7 +63,7 @@ groups:
|
|||
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbTooManyConnections
|
||||
expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
|
||||
expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
9
dist/rules/mysql/mysqld-exporter.yml
vendored
9
dist/rules/mysql/mysqld-exporter.yml
vendored
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlDown
|
||||
expr: 'mysql_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -41,18 +42,20 @@ groups:
|
|||
summary: MySQL high threads running (instance {{ $labels.instance }})
|
||||
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlSlaveIoThreadNotRunning
|
||||
expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
|
||||
description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlSlaveSqlThreadNotRunning
|
||||
expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
73
dist/rules/nats/nats-exporter.yml
vendored
73
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -5,24 +5,6 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
- alert: NatsHighConnectionCount
|
||||
expr: 'gnatsd_varz_connections > 100'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high connection count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighSubscriptionsCount
|
||||
expr: 'gnatsd_connz_subscriptions > 50'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high subscriptions count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_varz_routes > 10'
|
||||
for: 3m
|
||||
|
|
@ -59,8 +41,9 @@ groups:
|
|||
summary: Nats server down (instance {{ $labels.instance }})
|
||||
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
|
||||
- alert: NatsHighCpuUsage
|
||||
expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
|
||||
expr: 'gnatsd_varz_cpu > 80'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -78,7 +61,7 @@ groups:
|
|||
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamStoreUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
|
||||
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -87,7 +70,7 @@ groups:
|
|||
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamMemoryUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
|
||||
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -122,56 +105,20 @@ groups:
|
|||
summary: Nats too many errors (instance {{ $labels.instance }})
|
||||
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsJetstreamConsumersExceeded
|
||||
- alert: NatsJetstreamAccountsExceeded
|
||||
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
|
||||
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsFrequentAuthenticationTimeouts
|
||||
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
|
||||
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsMaxPayloadSizeExceeded
|
||||
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
|
||||
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
|
||||
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsLeafNodeConnectionIssue
|
||||
expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
|
||||
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsMaxPingOperationsExceeded
|
||||
expr: 'gnatsd_varz_ping_max > 50'
|
||||
expr: 'gnatsd_varz_leafnodes == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
|
||||
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsWriteDeadlineExceeded
|
||||
expr: 'gnatsd_varz_write_deadline > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
|
||||
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
|
||||
description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: PatroniHasNoLeader
|
||||
expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
|
||||
for: 0m
|
||||
expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
|
|
@ -21,10 +21,10 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: PGBouncer errors (instance {{ $labels.instance }})
|
||||
description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PgbouncerMaxConnections
|
||||
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0'
|
||||
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
25
dist/rules/postgresql/postgres-exporter.yml
vendored
25
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: PostgresqlDown
|
||||
expr: 'pg_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -104,24 +105,6 @@ groups:
|
|||
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
|
||||
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlUnusedReplicationSlot
|
||||
expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)'
|
||||
for: 1m
|
||||
|
|
@ -150,7 +133,7 @@ groups:
|
|||
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlSslCompressionActive
|
||||
expr: 'sum(pg_stat_ssl_compression) > 0'
|
||||
expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -159,7 +142,7 @@ groups:
|
|||
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
8
dist/rules/pulsar/embedded-exporter.yml
vendored
8
dist/rules/pulsar/embedded-exporter.yml
vendored
|
|
@ -24,7 +24,7 @@ groups:
|
|||
description: "The number of subscription backlog entries is over 100k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PulsarTopicLargeBacklogStorageSize
|
||||
expr: 'sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)'
|
||||
expr: 'sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -33,7 +33,7 @@ groups:
|
|||
description: "The topic backlog storage size is over 5 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PulsarTopicVeryLargeBacklogStorageSize
|
||||
expr: 'sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)'
|
||||
expr: 'sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -78,7 +78,7 @@ groups:
|
|||
description: "Observing Readonly Bookies\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PulsarHighNumberOfFunctionErrors
|
||||
expr: 'sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)'
|
||||
expr: 'sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -87,7 +87,7 @@ groups:
|
|||
description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PulsarHighNumberOfSinkErrors
|
||||
expr: 'sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)'
|
||||
expr: 'sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
17
dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
vendored
17
dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
vendored
|
|
@ -5,18 +5,20 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RabbitmqDown
|
||||
expr: 'rabbitmq_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: RabbitMQ down (instance {{ $labels.instance }})
|
||||
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RabbitmqClusterDown
|
||||
expr: 'sum(rabbitmq_running) < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -33,13 +35,13 @@ groups:
|
|||
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RabbitmqOutOfMemory
|
||||
expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
|
||||
expr: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: RabbitMQ out of memory (instance {{ $labels.instance }})
|
||||
description: "Memory available for RabbmitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RabbitmqTooManyConnections
|
||||
expr: 'rabbitmq_connectionsTotal > 1000'
|
||||
|
|
@ -80,9 +82,10 @@ groups:
|
|||
summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
|
||||
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Allows a short service restart.
|
||||
- alert: RabbitmqNoConsumer
|
||||
expr: 'rabbitmq_queue_consumers == 0'
|
||||
for: 1m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -100,11 +103,11 @@ groups:
|
|||
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the exchange name in dedicated label.
|
||||
- alert: RabbitmqUnactiveExchange
|
||||
- alert: RabbitmqInactiveExchange
|
||||
expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
|
||||
summary: RabbitMQ inactive exchange (instance {{ $labels.instance }})
|
||||
description: "Exchange receives fewer than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
12
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
12
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
|
|
@ -5,18 +5,20 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RabbitmqNodeDown
|
||||
expr: 'sum(rabbitmq_build_info) < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: RabbitMQ node down (instance {{ $labels.instance }})
|
||||
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RabbitmqNodeNotDistributed
|
||||
expr: 'erlang_vm_dist_node_state < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -33,7 +35,7 @@ groups:
|
|||
description: "Running different versions of RabbitMQ in the same cluster can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RabbitmqMemoryHigh
|
||||
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
|
||||
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -42,7 +44,7 @@ groups:
|
|||
description: "A node uses more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RabbitmqFileDescriptorsUsage
|
||||
expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
|
||||
expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -57,7 +59,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
|
||||
description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RabbitmqTooManyUnackMessages
|
||||
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
|
||||
|
|
|
|||
14
dist/rules/redis/oliver006-redis-exporter.yml
vendored
14
dist/rules/redis/oliver006-redis-exporter.yml
vendored
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RedisDown
|
||||
expr: 'redis_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -23,9 +24,10 @@ groups:
|
|||
summary: Redis missing master (instance {{ $labels.instance }})
|
||||
description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: RedisTooManyMasters
|
||||
expr: 'count(redis_instance_info{role="master"}) > 1'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -60,13 +62,13 @@ groups:
|
|||
description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RedisMissingBackup
|
||||
expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24'
|
||||
expr: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Redis missing backup (instance {{ $labels.instance }})
|
||||
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Redis has not been backed up for 48 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
|
||||
- alert: RedisOutOfSystemMemory
|
||||
|
|
@ -106,10 +108,10 @@ groups:
|
|||
description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: RedisRejectedConnections
|
||||
expr: 'increase(redis_rejected_connections_total[1m]) > 0'
|
||||
expr: 'increase(redis_rejected_connections_total[1m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Redis rejected connections (instance {{ $labels.instance }})
|
||||
description: "Some connections to Redis have been rejected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
2
dist/rules/solr/embedded-exporter.yml
vendored
2
dist/rules/solr/embedded-exporter.yml
vendored
|
|
@ -30,7 +30,7 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: Solr replication errors (instance {{ $labels.instance }})
|
||||
description: "Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SolrLowLiveNodeCount
|
||||
expr: 'solr_collections_live_nodes < 2'
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: SqlServerDown
|
||||
expr: 'mssql_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -15,10 +16,10 @@ groups:
|
|||
description: "SQL server instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SqlServerDeadlock
|
||||
expr: 'increase(mssql_deadlocks[1m]) > 5'
|
||||
for: 0m
|
||||
expr: 'mssql_deadlocks > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: SQL Server deadlock (instance {{ $labels.instance }})
|
||||
description: "SQL Server is having some deadlock.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -5,9 +5,10 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: ZookeeperDown
|
||||
expr: 'zk_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
Loading…
Reference in a new issue