mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-24 18:36:59 +08:00
Added query for node warmup before reporting it's down (#304)
Co-authored-by: Brett Yoakum <yoakum@adobe.com>
This commit is contained in:
parent
ae0e4a42db
commit
0887515f98
1 changed file with 4 additions and 0 deletions
|
|
@ -26,6 +26,10 @@ groups:
|
||||||
description: A Prometheus job does not have living target anymore.
|
description: A Prometheus job does not have living target anymore.
|
||||||
query: 'sum by (job) (up) == 0'
|
query: 'sum by (job) (up) == 0'
|
||||||
severity: critical
|
severity: critical
|
||||||
|
- name: Prometheus target missing with warmup time
|
||||||
|
description: Allow a job time to start up (10 minutes) before alerting that it's down.
|
||||||
|
query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||||
|
severity: critical
|
||||||
- name: Prometheus configuration reload failure
|
- name: Prometheus configuration reload failure
|
||||||
description: Prometheus configuration reload error
|
description: Prometheus configuration reload error
|
||||||
query: 'prometheus_config_last_reload_successful != 1'
|
query: 'prometheus_config_last_reload_successful != 1'
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue