mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 01:17:19 +08:00
adding prometheus internal alerts
This commit is contained in:
parent
189a3129c3
commit
affacde49b
1 changed files with 25 additions and 1 deletions
|
|
@ -6,6 +6,10 @@ services:
|
|||
description: Prometheus configuration reload error
|
||||
query: "prometheus_config_last_reload_successful != 1"
|
||||
severity: warning
|
||||
- name: Prometheus too many restarts
|
||||
description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
|
||||
query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
|
||||
severity: warning
|
||||
- name: Prometheus AlertManager configuration reload failure
|
||||
description: AlertManager configuration reload error
|
||||
query: "alertmanager_config_last_reload_successful != 1"
|
||||
|
|
@ -29,7 +33,19 @@ services:
|
|||
- name: Prometheus rule evaluation slow
|
||||
description: 'Prometheus rule evaluation took more time than the scheduled interval. It indicates slower storage backend access or an overly complex query.'
|
||||
query: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
|
||||
severity: error
|
||||
severity: warning
|
||||
- name: Prometheus notifications backlog
|
||||
description: The Prometheus notification queue has not been empty for 10 minutes
|
||||
query: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
|
||||
severity: warning
|
||||
- name: Prometheus target scraping slow
|
||||
description: Prometheus is scraping exporters slowly
|
||||
query: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
|
||||
severity: warning
|
||||
- name: Prometheus large scrape
|
||||
description: Prometheus has many scrapes that exceed the sample limit
|
||||
query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
|
||||
severity: warning
|
||||
- name: Prometheus TSDB checkpoint creation failures
|
||||
description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
|
||||
query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'
|
||||
|
|
@ -68,6 +84,10 @@ services:
|
|||
description: Node memory is filling up (< 10% left)
|
||||
query: "node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10"
|
||||
severity: warning
|
||||
- name: Host memory under memory pressure
|
||||
description: The node is under heavy memory pressure. High rate of major page faults
|
||||
query: "rate(node_vmstat_pgmajfault[1m]) > 1000"
|
||||
severity: warning
|
||||
- name: Host unusual network throughput in
|
||||
description: Host network interfaces are probably receiving too much data (> 100 MB/s)
|
||||
query: "sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100"
|
||||
|
|
@ -140,6 +160,10 @@ services:
|
|||
description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap'
|
||||
query: 'node_md_disks{state="fail"} > 0'
|
||||
severity: warning
|
||||
- name: Kernel version deviations
|
||||
description: Different kernel versions are running
|
||||
query: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1'
|
||||
severity: warning
|
||||
|
||||
- name: Docker containers
|
||||
exporters:
|
||||
|
|
|
|||
Loading…
Reference in a new issue