diff --git a/_data/rules.yml b/_data/rules.yml
index efd79bc..cda042f 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -302,6 +302,59 @@ services:
   - name: Etcd
     exporters:
       - rules:
+          - name: Insufficient Members
+            description: Etcd cluster should have an odd number of members
+            # Fires when the number of members reporting etcd_server_id is even.
+            query: 'count(etcd_server_id) % 2 == 0'
+            severity: error
+          - name: No Leader
+            description: Etcd cluster has no leader
+            query: 'etcd_server_has_leader == 0'
+            severity: error
+          - name: High number of leader changes
+            description: Etcd leader changed more than 3 times during last hour
+            query: 'increase(etcd_server_leader_changes_seen_total[1h]) > 3'
+            severity: warning
+          - name: High number of failed GRPC requests
+            description: More than 1% GRPC request failure detected in Etcd for 5 minutes
+            query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.01'
+            severity: warning
+          - name: High number of failed GRPC requests
+            description: More than 5% GRPC request failure detected in Etcd for 5 minutes
+            query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.05'
+            severity: error
+          - name: GRPC requests slow
+            description: GRPC requests slowing down, 99th percentile is over 0.15s for 5 minutes
+            query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15'
+            severity: warning
+          - name: High number of failed HTTP requests
+            description: More than 1% HTTP failure detected in Etcd for 5 minutes
+            query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.01'
+            severity: warning
+          - name: High number of failed HTTP requests
+            description: More than 5% HTTP failure detected in Etcd for 5 minutes
+            query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.05'
+            severity: error
+          - name: HTTP requests slow
+            description: HTTP requests slowing down, 99th percentile is over 0.15s for 5 minutes
+            query: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15'
+            severity: warning
+          - name: Etcd member communication slow
+            description: Etcd member communication slowing down, 99th percentile is over 0.15s for 5 minutes
+            query: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15'
+            severity: warning
+          - name: High number of failed proposals
+            description: Etcd server got more than 5 failed proposals past hour
+            query: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
+            severity: warning
+          - name: High fsync durations
+            description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s for 5 minutes
+            query: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5'
+            severity: warning
+          - name: High commit durations
+            description: Etcd commit duration increasing, 99th percentile is over 0.25s for 5 minutes
+            query: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25'
+            severity: warning

   - name: Zookeeper
     exporters:
@@ -314,7 +367,7 @@ services:
       - name: danielqsj/kafka_exporter
         doc_url: https://github.com/danielqsj/kafka_exporter
         rules:
-          - name: Kafka Topics 
+          - name: Kafka Topics
             description: Kafka topic in-sync partition
             query: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
             severity: error