Added query for node warmup before reporting it's down

This commit is contained in:
Brett Yoakum 2022-08-26 13:04:37 -06:00
parent ae0e4a42db
commit a42d32fb26

View file

@@ -26,6 +26,10 @@ groups:
description: A Prometheus job does not have living target anymore.
query: 'sum by (job) (up) == 0'
severity: critical
# Rule entry: report a down target only once its instance has passed a warmup window.
# The query keeps series where `up == 0` and multiplies them by the series where
# `node_time_seconds - node_boot_time_seconds > 600` (600 s = the 10 minutes named in the
# description), then aggregates the result per (instance, job).
- name: Prometheus target missing with warmup time
description: Allow a job time to start up (10 minutes) before alerting that it's down.
# NOTE(review): the two sides are matched `on (instance)` only, so the down target and the
# node_* series presumably must carry the same `instance` label value -- confirm against the
# scrape configuration.
# NOTE(review): when no node_* series matches the instance, the join presumably yields no
# result and this rule stays silent -- confirm that is the intended behavior.
query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
severity: critical
- name: Prometheus configuration reload failure
description: Prometheus configuration reload error
query: 'prometheus_config_last_reload_successful != 1'