From 0ba7c2a47e9f49427be8b84afc09ed33ce9a5400 Mon Sep 17 00:00:00 2001 From: Alberto del Barrio <9251788+the-smooth-operator@users.noreply.github.com> Date: Sun, 27 Jun 2021 14:16:42 +0200 Subject: [PATCH 1/6] fix typo (#228) --- _data/rules.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index d38fec3..98b0fd7 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -836,12 +836,12 @@ groups: query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' severity: warning for: 2m - - name: Rabbitmq too much unack - description: Too much unacknowledged messages + - name: Rabbitmq too many unack messages + description: Too many unacknowledged messages query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' severity: warning for: 1m - - name: Rabbitmq too much connections + - name: Rabbitmq too many connections description: The total connections of a node is too high query: 'rabbitmq_connections > 1000' severity: warning From d1235231640590bb04868a76ebad5208218f4ee1 Mon Sep 17 00:00:00 2001 From: piano <35907674+piano-wow@users.noreply.github.com> Date: Tue, 29 Jun 2021 18:21:35 +0800 Subject: [PATCH 2/6] Update alertmanager.md (#231) webhook_config should be webhook_configs --- alertmanager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alertmanager.md b/alertmanager.md index c7f8add..849a08f 100644 --- a/alertmanager.md +++ b/alertmanager.md @@ -92,7 +92,7 @@ receivers: text: "{{ range .Alerts }} {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}" - name: "pager" - webhook_config: + webhook_configs: - url: http://a.b.c.d:8080/send/sms send_resolved: true From 6a6f89bad5aa92093614f32f89ab3603fff57d5e Mon Sep 17 00:00:00 2001 From: Alexandros Orfanos Date: Tue, 29 Jun 2021 13:37:54 +0300 Subject: [PATCH 3/6] Add php-fpm max-children alert (#224) --- _data/rules.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml 
index 98b0fd7..3a0fbb5 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1441,6 +1441,10 @@ groups: - name: bakins/php-fpm-exporter doc_url: https://github.com/bakins/php-fpm-exporter rules: + - name: PHP-FPM max-children reached + description: PHP-FPM reached max children - {{ $labels.instance }} + query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0' + severity: warning - name: JVM exporters: From 650914d4ad5c2e0eb98a3d69c05469d212a44230 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Jun 2021 12:38:05 +0200 Subject: [PATCH 4/6] build(deps): bump nokogiri from 1.11.1 to 1.11.4 (#221) Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.11.1 to 1.11.4. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.11.1...v1.11.4) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 086a903..d2200ac 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -197,14 +197,14 @@ GEM rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.5.0) + mini_portile2 (2.5.1) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) minitest (5.14.1) multipart-post (2.1.1) - nokogiri (1.11.1) + nokogiri (1.11.4) mini_portile2 (~> 2.5.0) racc (~> 1.4) octokit (4.16.0) From 243c0280cf460b3e027a0a9db53e8fdbab43153a Mon Sep 17 00:00:00 2001 From: asteny <32196849+asteny@users.noreply.github.com> Date: Mon, 5 Jul 2021 00:28:58 +0300 Subject: [PATCH 5/6] Haproxy 2 embedded exporter fixes (#229) --- _data/rules.yml | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) 
diff --git a/_data/rules.yml b/_data/rules.yml index 3a0fbb5..c2f5385 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1247,22 +1247,22 @@ groups: rules: - name: HAProxy high HTTP 4xx error rate backend description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 4xx error rate server description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: 
((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy server response errors @@ -1279,33 +1279,29 @@ groups: description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be to high. query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 severity: critical - - name: HAProxy backend max active session - description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). - query: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80 + - name: HAProxy backend max active session > 80% + description: Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}% + query: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80 severity: warning for: 2m - name: HAProxy pending requests - description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0 severity: warning for: 2m - name: HAProxy HTTP slowing down - description: Average request time is increasing + description: Average request time is increasing - {{ $value | printf "%.2f"}} query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1 severity: warning for: 1m - name: HAProxy retry high - description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 severity: warning for: 2m - - name: HAProxy proxy 
down - description: HAProxy proxy is down - query: haproxy_backend_up == 0 - severity: critical - - name: HAProxy server down - description: HAProxy backend is down - query: haproxy_backend_active_servers == 0 + - name: HAProxy has no alive backends + description: HAProxy has no alive active or backup backends for {{ $labels.proxy }} + query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0 severity: critical - name: HAProxy frontend security blocked requests description: HAProxy is blocking requests for security reason From c2b8178304a839eb108a6cd3f297645e8e90ad7b Mon Sep 17 00:00:00 2001 From: Gjed Date: Sun, 4 Jul 2021 23:59:46 +0200 Subject: [PATCH 6/6] Loki alerts (#218) Co-authored-by: Samuel Berthe --- README.md | 2 ++ _data/rules.yml | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/README.md b/README.md index d0602de..0f1562e 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/ #### Other - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos) +- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki) +- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex) ## 🤝 Contributing diff --git a/_data/rules.yml b/_data/rules.yml index c2f5385..99f566d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2001,3 +2001,29 @@ groups: description: Thanos compaction has not run in 24 hours.
query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' severity: critical + - name: Loki + exporters: + - rules: + - name: Loki process too many restarts + description: A loki process had too many restarts (target {{ $labels.instance }}) + query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2 + severity: warning + - name: Cortex + exporters: + - rules: + - name: Cortex ruler configuration reload failure + description: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) + query: cortex_ruler_config_last_reload_successful != 1 + severity: warning + - name: Cortex not connected to Alertmanager + description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) + query: cortex_prometheus_notifications_alertmanagers_discovered < 1 + severity: critical + - name: Cortex notifications are being dropped + description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0 + severity: critical + - name: Cortex notification error + description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0 + severity: critical