adding prometheus internal alerts

2026-06-22 01:17:19 +08:00 · 2020-03-09 00:16:17 +01:00 · 2020-03-09 00:16:17 +01:00 · affacde49b
commit affacde49b
parent 189a3129c3
1 changed files with 25 additions and 1 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -6,6 +6,10 @@ services:
            description: Prometheus configuration reload error
            query: "prometheus_config_last_reload_successful != 1"
            severity: warning
+          - name: Prometheus too many restarts
+            description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
+            query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+            severity: warning
          - name: Prometheus AlertManager configuration reload failure
            description: AlertManager configuration reload error
            query: "alertmanager_config_last_reload_successful != 1"
@@ -29,7 +33,19 @@ services:
          - name: Prometheus rule evaluation slow
            description: 'Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.'
            query: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
-            severity: error
+            severity: warning
+          - name: Prometheus notifications backlog
+            description: The Prometheus notification queue has not been empty for 10 minutes
+            query: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
+            severity: warning
+          - name: Prometheus target scraping slow
+            description: 'Prometheus is scraping exporters slowly'
+            query: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
+            severity: warning
+          - name: Prometheus large scrape
+            description: Prometheus has many scrapes that exceed the sample limit
+            query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
+            severity: warning
          - name: Prometheus TSDB checkpoint creation failures
            description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
            query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'
@@ -68,6 +84,10 @@ services:
            description: Node memory is filling up (< 10% left)
            query: "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10"
            severity: warning
+          - name: Host memory under memory pressure
+            description: 'The node is under heavy memory pressure. High rate of major page faults'
+            query: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
+            severity: warning
          - name: Host unusual network throughput in
            description: Host network interfaces are probably receiving too much data (> 100 MB/s)
            query: "sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100"
@@ -140,6 +160,10 @@ services:
            description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap'
            query: 'node_md_disks{state="fail"} > 0'
            severity: warning
+          - name: Kernel version deviations
+            description: Different kernel versions are running
+            query: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+\\.[0-9]+\\.[0-9]+).*")) by (kernel)) > 1'
+            severity: warning

  - name: Docker containers
    exporters: