From affacde49b18179abe5cda9131870578b690acc8 Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 9 Mar 2020 00:16:17 +0100
Subject: [PATCH] adding prometheus internal alerts

---
Review notes (free text in this section is ignored by `git am`):

* "Prometheus too many restarts": the query is now a single-quoted YAML
  scalar. The PromQL matcher contains double quotes, which terminated the
  previous double-quoted scalar early and made the file unparseable.
* "Prometheus notifications backlog": added `> 0`. Without a comparison
  the expression returns a sample for every series (including those at
  0), so the alert would fire permanently.
* "Prometheus large scrape": fixed the "scapres" typo in the description.
* Only added lines were changed, so hunk line counts and the diffstat
  are unchanged. The post-image blob id on the `index` line was NOT
  recomputed after these edits.
* NOTE(review): in the unchanged context, "Prometheus rule evaluation
  slow" compares last_duration `<` interval, which looks inverted
  relative to its description, and that description contains the typo
  "I indicates". Both are pre-existing lines and need a follow-up patch.
* NOTE(review): leading whitespace of the YAML lines was reconstructed;
  confirm it against the target file before applying.

 _data/rules.yml | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 518092d..b03d4ed 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -6,6 +6,10 @@ services:
             description: Prometheus configuration reload error
             query: "prometheus_config_last_reload_successful != 1"
             severity: warning
+          - name: Prometheus too many restarts
+            description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
+            query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+            severity: warning
           - name: Prometheus AlertManager configuration reload failure
             description: AlertManager configuration reload error
             query: "alertmanager_config_last_reload_successful != 1"
@@ -29,7 +33,19 @@ services:
           - name: Prometheus rule evaluation slow
             description: 'Prometheus rule evaluation took more time than the scheduled interval. I indicates a slower storage backend access or too complex query.'
             query: 'prometheus_rule_group_last_duration_seconds < prometheus_rule_group_interval_seconds'
-            severity: error
+            severity: warning
+          - name: Prometheus notifications backlog
+            description: The Prometheus notification queue has not been empty for 10 minutes
+            query: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
+            severity: warning
+          - name: Prometheus target scraping slow
+            description: Prometheus is scraping exporters slowly
+            query: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
+            severity: warning
+          - name: Prometheus large scrape
+            description: Prometheus has many scrapes that exceed the sample limit
+            query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
+            severity: warning
           - name: Prometheus TSDB checkpoint creation failures
             description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
             query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'
@@ -68,6 +84,10 @@ services:
             description: Node memory is filling up (< 10% left)
             query: "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10"
             severity: warning
+          - name: Host memory under memory pressure
+            description: The node is under heavy memory pressure. High rate of major page faults
+            query: "rate(node_vmstat_pgmajfault[1m]) > 1000"
+            severity: warning
           - name: Host unusual network throughput in
             description: Host network interfaces are probably receiving too much data (> 100 MB/s)
             query: "sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100"
@@ -140,6 +160,10 @@ services:
             description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap'
             query: 'node_md_disks{state="fail"} > 0'
             severity: warning
+          - name: Kernel version deviations
+            description: Different kernel versions are running
+            query: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
+            severity: warning
 
   - name: Docker containers
     exporters: