From 78a7e61050d5a7a07f24a89270b2bba8512db9a7 Mon Sep 17 00:00:00 2001
From: "MikeN. Paxos" <84518874+mike2194@users.noreply.github.com>
Date: Mon, 27 Dec 2021 06:48:07 -0500
Subject: [PATCH 1/4] added jenkins alert rules for jenkins metrics plugin (#268)

* added jenkins alert rules

* Update rules.yml

Co-authored-by: Samuel Berthe
---
NOTE(review): this series was restored to one record per line from a
whitespace-collapsed copy. YAML indentation depth (and any column alignment
inside the `comments: |` block) is reconstructed, not recovered -- TODO
confirm against _data/rules.yml before applying. Some From:/Co-authored-by:
lines in this series carry no e-mail address; presumably they were stripped
and need restoring before `git am` -- verify.

 _data/rules.yml | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index acf067b..1f98cb6 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2126,3 +2126,41 @@ groups:
                 query: sum by (job) (cortex_query_frontend_queue_length) > 0
                 severity: critical
                 for: 5m
+
+      - name: Jenkins
+        exporters:
+          - name: Metric plugin
+            doc_url: https://plugins.jenkins.io/prometheus/
+            rules:
+              - name: Jenkins offline
+                description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'jenkins_node_offline_value > 1'
+                severity: critical
+              - name: Jenkins healthcheck
+                description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'jenkins_health_check_score < 1'
+                severity: critical
+              - name: Jenkins builds health score
+                description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'default_jenkins_builds_health_score < 1'
+                severity: critical
+              - name: Jenkins run failure total
+                description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'delta(jenkins_runs_failure_total[1h]) > 100'
+                severity: warning
+              - name: JenkinsBuildTestsFailing
+                description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'default_jenkins_builds_last_build_tests_failing > 0'
+                severity: warning
+              - name: Jenkins last build failed
+                description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
+                query: 'default_jenkins_builds_last_build_result_ordinal == 2'
+                severity: warning
+                comments: |
+                  * RUNNING -1 true - The build had no errors.
+                  * SUCCESS 0 true - The build had no errors.
+                  * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
+                  * FAILURE 2 false - The build had a fatal error.
+                  * NOT_BUILT 3 false - The module was not built.
+                  * ABORTED 4 false - The build was manually aborted.
+

From 37722256d5329df75aae2425c1da2d2c224e9482 Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 27 Dec 2021 12:49:32 +0100
Subject: [PATCH 2/4] Adding jenkins

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 37cea8b..821c3d2 100644
--- a/README.md
+++ b/README.md
@@ -77,6 +77,7 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
 - [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
 - [Promtail](https://awesome-prometheus-alerts.grep.to/rules#promtail)
 - [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)
+- [Jenkins](https://awesome-prometheus-alerts.grep.to/rules#jenkins)
 
 ## 🤝 Contributing
 

From 038e46743d0de725a2aacaadc959a965203698de Mon Sep 17 00:00:00 2001
From: armondressler
Date: Sun, 16 Jan 2022 03:24:36 +0100
Subject: [PATCH 3/4] fixed erroneous usage of rate() function on gauges (#270)

Co-authored-by: Dressler Armon, B2B-PAP-HLT-DO-ENG
---
 _data/rules.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 1f98cb6..fd5aa22 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -494,12 +494,12 @@ groups:
                 severity: critical
               - name: MySQL too many connections (> 80%)
                 description: 'More than 80% of MySQL connections are in use on {{ $labels.instance }}'
-                query: 'avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80'
+                query: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80'
                 severity: warning
                 for: 2m
               - name: MySQL high threads running
                 description: 'More than 60% of MySQL connections are in running state on {{ $labels.instance }}'
-                query: 'avg by (instance) (rate(mysql_global_status_threads_running[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60'
+                query: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
                 severity: warning
                 for: 2m
               - name: MySQL Slave IO thread not running

From 21ddd2f75297acc58045993a22eab1ab7bdf31da Mon Sep 17 00:00:00 2001
From: Koen Dierckx
Date: Sun, 23 Jan 2022 19:36:36 +0100
Subject: [PATCH 4/4] Added Alert manager job alert (#272)

Co-authored-by: DIERCKXK
---
 _data/rules.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index fd5aa22..a89bbe4 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -33,6 +33,10 @@ groups:
                 description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
                 query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
                 severity: warning
+              - name: Prometheus AlertManager job missing
+                description: A Prometheus AlertManager job has disappeared
+                query: 'absent(up{job="alertmanager"})'
+                severity: warning
               - name: Prometheus AlertManager configuration reload failure
                 description: AlertManager configuration reload error
                 query: 'alertmanager_config_last_reload_successful != 1'