From 0887515f98823c20aea723f8ac1a974f33833afc Mon Sep 17 00:00:00 2001 From: Brett Date: Sun, 28 Aug 2022 08:31:15 -0600 Subject: [PATCH] Added query for node warmup before reporting it's down (#304) Co-authored-by: Brett Yoakum --- _data/rules.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index 1634679..e8f9a7f 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -26,6 +26,10 @@ groups: description: A Prometheus job does not have living target anymore. query: 'sum by (job) (up) == 0' severity: critical + - name: Prometheus target missing with warmup time + description: Allow a job time to start up (10 minutes) before alerting that it's down. + query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' + severity: critical - name: Prometheus configuration reload failure description: Prometheus configuration reload error query: 'prometheus_config_last_reload_successful != 1'