Adding Etcd metrics

This commit is contained in:
Samuel Berthe 2019-02-20 13:28:12 +01:00
parent 19ef1dcf6d
commit 273fd6b9e3

View file

@ -302,6 +302,58 @@ services:
- name: Etcd
exporters:
- rules:
# Fires when the number of etcd_server_id series is even, matching the
# description: an even-sized cluster tolerates no more member failures than
# the next smaller odd-sized one.
# The earlier expression (count > count / 2 - 1) was true for every
# non-empty cluster, so the alert could never clear.
# NOTE(review): assumes one etcd_server_id series per scraped member — confirm.
- name: Insufficient Members
description: Etcd cluster should have an odd number of members
query: 'count(etcd_server_id) % 2 == 0'
severity: error
# Fires for every series whose etcd_server_has_leader gauge equals 0.
- name: No Leader
description: Etcd cluster has no leader
query: 'etcd_server_has_leader == 0'
severity: error
# Fires when etcd_server_leader_changes_seen_total grew by more than 3 over
# the trailing 1h window (increase() over a [1h] range).
- name: High number of leader changes
description: Etcd leader changed more than 3 times during last hour
query: 'increase(etcd_server_leader_changes_seen_total[1h]) > 3'
severity: warning
# Per (grpc_service, grpc_method): 5m rate of handled calls whose grpc_code
# is not "OK", divided by the 5m rate of all handled calls; fires above 0.01.
# The 5m in the description is the rate() window; no hold duration is set in
# this entry.
# NOTE(review): the following rule reuses this exact name with a 0.05
# threshold and error severity — confirm the consumer tolerates duplicate names.
- name: High number of failed GRPC requests
description: More than 1% GRPC request failure detected in Etcd for 5 minutes
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.01'
severity: warning
# Same failure ratio as the 1% warning rule (non-"OK" handled calls over all
# handled calls, per grpc_service and grpc_method, 5m rate window), but this
# entry fires above 0.05 and is raised at error severity.
- name: High number of failed GRPC requests
description: More than 5% GRPC request failure detected in Etcd for 5 minutes
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[5m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[5m])) BY (grpc_service, grpc_method) > 0.05'
severity: error
# Estimates the 0.99 quantile of grpc_server_handling_seconds for unary calls
# from the 5m bucket rates, summed per (grpc_service, grpc_method, le);
# fires when that estimate exceeds 0.15.
- name: GRPC requests slow
description: GRPC requests slowing down, 99th percentile is over 0.15s for 5 minutes
query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15'
severity: warning
# Per HTTP method: 5m rate of etcd_http_failed_total divided by the 5m rate
# of etcd_http_received_total; fires when the ratio exceeds 0.01.
# NOTE(review): the following rule reuses this exact name with a 0.05
# threshold and error severity — confirm the consumer tolerates duplicate names.
- name: High number of failed HTTP requests
description: More than 1% HTTP failure detected in Etcd for 5 minutes
query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.01'
severity: warning
# Same per-method failure ratio as the 1% warning rule
# (etcd_http_failed_total over etcd_http_received_total, 5m rate window),
# but this entry fires above 0.05 and is raised at error severity.
- name: High number of failed HTTP requests
description: More than 5% HTTP failure detected in Etcd for 5 minutes
query: 'sum(rate(etcd_http_failed_total[5m])) BY (method) / sum(rate(etcd_http_received_total[5m])) BY (method) > 0.05'
severity: error
# Estimates the 0.99 quantile of etcd_http_successful_duration_seconds from
# the 5m bucket rates (no aggregation, so one result per label set);
# fires when the estimate exceeds 0.15.
- name: HTTP requests slow
description: HTTP requests slowing down, 99th percentile is over 0.15s for 5 minutes
query: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) > 0.15'
severity: warning
# Estimates the 0.99 quantile of etcd_network_peer_round_trip_time_seconds
# from the 5m bucket rates (one result per label set); fires when the
# estimate exceeds 0.15.
- name: Etcd member communication slow
description: Etcd member communication slowing down, 99th percentile is over 0.15s for 5 minutes
query: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15'
severity: warning
# Fires when etcd_server_proposals_failed_total grew by more than 5 over the
# trailing 1h window (increase() over a [1h] range).
- name: High number of failed proposals
description: Etcd server got more than 5 failed proposals past hour
query: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
severity: warning
# Estimates the 0.99 quantile of etcd_disk_wal_fsync_duration_seconds from
# the 5m bucket rates (one result per label set); fires when the estimate
# exceeds 0.5.
- name: High fsync durations
description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s for 5 minutes
query: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) > 0.5'
severity: warning
# Estimates the 0.99 quantile of etcd_disk_backend_commit_duration_seconds
# from the 5m bucket rates (one result per label set); fires when the
# estimate exceeds 0.25.
- name: High commit durations
description: Etcd commit duration increasing, 99th percentile is over 0.25s for 5 minutes
query: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) > 0.25'
severity: warning
- name: Zookeeper
exporters:
@ -314,7 +366,7 @@ services:
- name: danielqsj/kafka_exporter
doc_url: https://github.com/danielqsj/kafka_exporter
rules:
- name: Kafka Topics
- name: Kafka Topics
description: Kafka topic in-sync partition
query: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
severity: error