mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 01:17:19 +08:00
Adding Etcd metrics
This commit is contained in:
parent
19ef1dcf6d
commit
273fd6b9e3
1 changed files with 53 additions and 1 deletions
|
|
@ -302,6 +302,58 @@ services:
|
|||
- name: Etcd
|
||||
exporters:
|
||||
- rules:
|
||||
- name: Insufficient Members
|
||||
description: Etcd cluster should have an odd number of members
|
||||
# Fires when the number of reporting members is even; the previous expression (count > count / 2 - 1) was true for every cluster size.
query: 'count(etcd_server_id) % 2 == 0'
|
||||
severity: error
|
||||
- name: No Leader
|
||||
description: Etcd cluster has no leader
|
||||
query: 'etcd_server_has_leader == 0'
|
||||
severity: error
|
||||
- name: High number of leader changes
|
||||
description: Etcd leader changed more than 3 times during the last hour
|
||||
query: 'increase(etcd_server_leader_changes_seen_total[1h]) > 3'
|
||||
severity: warning
|
||||
- name: High number of failed GRPC requests
|
||||
description: More than 1% GRPC request failure detected in Etcd for 5 minutes
|
||||
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.01'
|
||||
severity: warning
|
||||
- name: High number of failed GRPC requests
|
||||
description: More than 5% GRPC request failure detected in Etcd for 5 minutes
|
||||
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.05'
|
||||
severity: error
|
||||
- name: GRPC requests slow
|
||||
description: GRPC requests slowing down, 99th percentile is over 0.15s for 5 minutes
|
||||
query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15'
|
||||
severity: warning
|
||||
- name: High number of failed HTTP requests
|
||||
description: More than 1% HTTP failure detected in Etcd for 5 minutes
|
||||
query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.01'
|
||||
severity: warning
|
||||
- name: High number of failed HTTP requests
|
||||
description: More than 5% HTTP failure detected in Etcd for 5 minutes
|
||||
query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.05'
|
||||
severity: error
|
||||
- name: HTTP requests slow
|
||||
description: HTTP requests slowing down, 99th percentile is over 0.15s for 5 minutes
|
||||
query: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15'
|
||||
severity: warning
|
||||
- name: Etcd member communication slow
|
||||
description: Etcd member communication slowing down, 99th percentile is over 0.15s for 5 minutes
|
||||
query: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15'
|
||||
severity: warning
|
||||
- name: High number of failed proposals
|
||||
description: Etcd server got more than 5 failed proposals in the past hour
|
||||
query: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
|
||||
severity: warning
|
||||
- name: High fsync durations
|
||||
description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s for 5 minutes
|
||||
query: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5'
|
||||
severity: warning
|
||||
- name: High commit durations
|
||||
description: Etcd commit duration increasing, 99th percentile is over 0.25s for 5 minutes
|
||||
query: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25'
|
||||
severity: warning
|
||||
|
||||
- name: Zookeeper
|
||||
exporters:
|
||||
|
|
@ -314,7 +366,7 @@ services:
|
|||
- name: danielqsj/kafka_exporter
|
||||
doc_url: https://github.com/danielqsj/kafka_exporter
|
||||
rules:
|
||||
- name: Kafka Topics
|
||||
- name: Kafka Topics
|
||||
description: Kafka topic in-sync partition
|
||||
query: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
|
||||
severity: error
|
||||
|
|
|
|||
Loading…
Reference in a new issue