Add rules to Prometheus self-monitoring

2026-06-21 08:57:19 +08:00 · 2020-03-17 20:56:49 +01:00 · 2020-03-17 20:56:49 +01:00 · c653b37e15
commit c653b37e15
parent fc3e72041c
1 changed files with 25 additions and 9 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@ -4,6 +4,18 @@ groups:
      - name: Prometheus self-monitoring
        exporters:
          - rules:
+            - name: Prometheus job missing
+              description: A Prometheus job has disappeared
+              query: 'absent(up{job="my-job"})'  # NOTE(review): "my-job" looks like a placeholder job name; confirm it is meant to be replaced per deployment
+              severity: warning
+            - name: Prometheus target missing
+              description: A Prometheus target has disappeared. An exporter might have crashed.
+              query: 'up == 0'
+              severity: error
+            - name: Prometheus all targets missing
+              description: A Prometheus job does not have any living target anymore.
+              query: 'sum by (job) (up) == 0'  # sum, not count: count(up) is >= 1 whenever the job still has series, so it could never equal 0
+              severity: error
            - name: Prometheus configuration reload failure
              description: Prometheus configuration reload error
              query: 'prometheus_config_last_reload_successful != 1'
@ -16,6 +28,10 @@ groups:
              description: AlertManager configuration reload error
              query: 'alertmanager_config_last_reload_successful != 1'
              severity: warning
+            - name: Prometheus AlertManager config not synced
+              description: Configurations of AlertManager cluster instances are out of sync
+              query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'  # count_values yields one series per distinct hash value; more than one means the hashes differ
+              severity: warning
            - name: Prometheus AlertManager E2E dead man snitch
              description: Prometheus DeadManSnitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
              query: 'vector(1)'
@ -24,10 +40,6 @@ groups:
              description: Prometheus cannot connect the alertmanager
              query: "prometheus_notifications_alertmanagers_discovered < 1"
              severity: error
-            - name: Prometheus Exporter down
-              description: Prometheus exporter down
-              query: "up == 0"
-              severity: error
            - name: Prometheus rule evaluation failures
              description: 'Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.'
              query: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
@ -60,6 +72,10 @@ groups:
              description: Prometheus has many scrapes that exceed the sample limit
              query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
              severity: warning
+            - name: Prometheus target scrape duplicate
+              description: Prometheus has many samples rejected due to duplicate timestamps but different values
+              query: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'  # NOTE(review): "> 0" fires on any increase in 5m, while the description says "many"; confirm intended threshold
+              severity: warning
            - name: Prometheus TSDB checkpoint creation failures
              description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
              query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'
@ -266,15 +282,15 @@ groups:
                severity: error
               - name: Windows Server CPU Usage
                 description: CPU Usage is more than 80%
-                query: '100 - (avg by (instance) (irate(wmi_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
+                query: '100 - (avg by (instance) (rate(wmi_cpu_time_total{mode="idle"}[2m])) * 100) > 80'  # rate() averages over the whole 2m range; irate() only looks at the last two samples
                 severity: warning
              - name: Windows Server memory Usage
-                description: Memory Usage is more than 90%
-                query: "100*(wmi_os_physical_memory_free_bytes) / wmi_cs_physical_memory_bytes > 90"
+                description: Memory usage is more than 90%
+                query: "100 * (wmi_os_physical_memory_free_bytes) / wmi_cs_physical_memory_bytes > 90"
                severity: warning
              - name: Windows Server disk Space Usage
-                description: Disk Space on Drive is used more than 80%
-                query: "100.0 - 100 * ((wmi_logical_disk_free_bytes{} / 1024 / 1024 ) / (wmi_logical_disk_size_bytes{}  / 1024 / 1024)) > 80"
+                description: Disk usage is more than 80%
+                query: "100.0 - 100 * ((wmi_logical_disk_free_bytes{} / 1024 / 1024 ) / (wmi_logical_disk_size_bytes{} / 1024 / 1024)) > 80"
                severity: error