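The "Prometheus target missing with warmup time" query fails if instance names are not unique across jobs. This fixes it by replacing `group_right(job)` with `group_left (__name__)` in the join on `instance`.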

2026-06-26 11:27:00 +08:00 · 2024-07-02 13:49:12 -04:00 · 2024-07-02 13:49:12 -04:00 · 54e2b09b3d
commit 54e2b09b3d
parent 51d0484bb4
1 changed files with 1 additions and 1 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -27,7 +27,7 @@ groups:
                severity: critical
              - name: Prometheus target missing with warmup time
                description: Allow a job time to start up (10 minutes) before alerting that it's down.
-                query: "sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))"
+                query: "sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))"
                severity: critical
              - name: Prometheus configuration reload failure
                description: Prometheus configuration reload error