From 79f2858037c933d370f723d3fba253b5befaba9f Mon Sep 17 00:00:00 2001 From: Simon Matic Langford Date: Mon, 17 Nov 2025 13:56:04 +0000 Subject: [PATCH] Improve Jenkins node alerts to better handle servers with multiple nodes (#484) --- _data/rules.yml | 11 ++++++++--- dist/rules/jenkins/metric-plugin.yml | 17 +++++++++++++---- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index bf2ee22..f4c66c1 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -3259,9 +3259,14 @@ groups: slug: metric-plugin doc_url: https://plugins.jenkins.io/prometheus/ rules: - - name: Jenkins offline - description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "jenkins_node_offline_value > 1" + - name: Jenkins node offline + description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "jenkins_node_offline_value > 0" + severity: critical + for: 5m + - name: Jenkins no node online + description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "jenkins_node_online_value == 0" severity: critical - name: Jenkins healthcheck description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" diff --git a/dist/rules/jenkins/metric-plugin.yml b/dist/rules/jenkins/metric-plugin.yml index 57c9cf6..5271e1e 100644 --- a/dist/rules/jenkins/metric-plugin.yml +++ b/dist/rules/jenkins/metric-plugin.yml @@ -5,14 +5,23 @@ groups: rules: - - alert: JenkinsOffline - expr: 'jenkins_node_offline_value > 1' + - alert: JenkinsNodeOffline + expr: 'jenkins_node_offline_value > 0' + for: 5m + labels: + severity: critical + annotations: + summary: Jenkins node offline (instance {{ $labels.instance }}) + description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: JenkinsNoNodeOnline + expr: 'jenkins_node_online_value == 0' for: 0m labels: severity: critical annotations: - summary: Jenkins offline (instance {{ $labels.instance }}) - description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Jenkins no node online (instance {{ $labels.instance }}) + description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: JenkinsHealthcheck expr: 'jenkins_health_check_score < 1'