adding rules to prometheus self monitoring

This commit is contained in:
Samuel Berthe 2020-03-17 20:56:49 +01:00
parent fc3e72041c
commit c653b37e15
No known key found for this signature in database
GPG key ID: 9D7813625412A946

View file

@ -4,6 +4,18 @@ groups:
- name: Prometheus self-monitoring
exporters:
- rules:
# Fires when no `up` series exists at all for the selected job (absent() yields 1 only for an empty vector).
# NOTE(review): "my-job" looks like a placeholder job label - replace it with a real job name before use.
- name: Prometheus job missing
description: A Prometheus job has disappeared
query: 'absent(up{job="my-job"})'
severity: warning
# Matches every individual target whose `up` sample is 0, i.e. whose most recent scrape failed.
- name: Prometheus target missing
description: A Prometheus target has disappeared. An exporter might be crashed.
query: 'up == 0'
severity: error
# `up` is 1 for a healthy target and 0 for a failed scrape, so the per-job sum
# is the number of living targets. count() would tally the series regardless of
# their value and therefore can never yield 0 for a job that still has targets.
- name: Prometheus all targets missing
description: A Prometheus job does not have any living target anymore.
query: 'sum by (job) (up) == 0'
severity: error
- name: Prometheus configuration reload failure
description: Prometheus configuration reload error
query: 'prometheus_config_last_reload_successful != 1'
@ -16,6 +28,10 @@ groups:
description: AlertManager configuration reload error
query: 'alertmanager_config_last_reload_successful != 1'
severity: warning
# count_values groups the alertmanager_config_hash samples by their value (one output series per distinct hash);
# the outer count() then yields the number of distinct hashes, and more than one triggers the alert.
- name: Prometheus AlertManager config not synced
description: Configurations of AlertManager cluster instances are out of sync
query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
severity: warning
- name: Prometheus AlertManager E2E dead man snitch
description: Prometheus DeadManSnitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
query: 'vector(1)'
@ -24,10 +40,6 @@ groups:
description: Prometheus cannot connect the alertmanager
query: "prometheus_notifications_alertmanagers_discovered < 1"
severity: error
# NOTE(review): this entry uses the same `up == 0` query as the "Prometheus target missing" rule earlier in this listing.
# The hunk header above (10 lines before, 6 lines after) suggests these four lines are the ones the commit removes - confirm.
- name: Prometheus Exporter down
description: Prometheus exporter down
query: "up == 0"
severity: error
- name: Prometheus rule evaluation failures
description: 'Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.'
query: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
@ -60,6 +72,10 @@ groups:
description: Prometheus has many scrapes that exceed the sample limit
query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
severity: warning
# Fires as soon as the duplicate-timestamp rejection counter shows any increase over the last 5 minutes (threshold is > 0).
- name: Prometheus target scrape duplicate
description: Prometheus has many samples rejected due to duplicate timestamps but different values
query: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
severity: warning
- name: Prometheus TSDB checkpoint creation failures
description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'
@ -266,15 +282,15 @@ groups:
severity: error
# CPU usage is computed as 100 minus the per-instance average idle-time rate expressed as a percentage.
# NOTE(review): two `query` lines appear below; they differ only in irate vs rate and look like the
# before/after sides of the diff hunk rather than two keys of one mapping - confirm.
- name: Windows Server CPU Usage
description: CPU Usage is more than 80%
query: '100 - (avg by (instance) (irate(wmi_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
query: '100 - (avg by (instance) (rate(wmi_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
severity: warning
# Memory usage percentage = 100 - (100 * free / total). The bare free/total ratio
# is the *free* percentage: comparing it with "> 90" would alert on nearly idle
# hosts and stay silent on hosts that are running out of memory.
# NOTE(review): description and query each appear twice (apparently the before/after
# sides of the diff hunk); both query lines carry the corrected expression.
- name: Windows Server memory Usage
description: Memory Usage is more than 90%
query: "100 - (100 * wmi_os_physical_memory_free_bytes / wmi_cs_physical_memory_bytes) > 90"
description: Memory usage is more than 90%
query: "100 - (100 * wmi_os_physical_memory_free_bytes / wmi_cs_physical_memory_bytes) > 90"
severity: warning
# Used-space percentage = 100 minus 100 * free / size; the "/ 1024 / 1024" factors are applied to both
# numerator and denominator, so they cancel out and do not affect the result.
# NOTE(review): description and query each appear twice - presumably the before/after sides of the diff hunk; confirm.
- name: Windows Server disk Space Usage
description: Disk Space on Drive is used more than 80%
query: "100.0 - 100 * ((wmi_logical_disk_free_bytes{} / 1024 / 1024 ) / (wmi_logical_disk_size_bytes{} / 1024 / 1024)) > 80"
description: Disk usage is more than 80%
query: "100.0 - 100 * ((wmi_logical_disk_free_bytes{} / 1024 / 1024 ) / (wmi_logical_disk_size_bytes{} / 1024 / 1024)) > 80"
severity: error