diff --git a/Gemfile.lock b/Gemfile.lock index 086a903..d2200ac 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -197,14 +197,14 @@ GEM rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) - mini_portile2 (2.5.0) + mini_portile2 (2.5.1) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) minitest (5.14.1) multipart-post (2.1.1) - nokogiri (1.11.1) + nokogiri (1.11.4) mini_portile2 (~> 2.5.0) racc (~> 1.4) octokit (4.16.0) diff --git a/README.md b/README.md index d0602de..0f1562e 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/ #### Other - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos) +- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki) +- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex) ## 🤝 Contributing diff --git a/_data/rules.yml b/_data/rules.yml index 2c346fe..9b1648d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -836,12 +836,12 @@ groups: query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' severity: warning for: 2m - - name: Rabbitmq too much unack - description: Too much unacknowledged messages + - name: Rabbitmq too many unack messages + description: Too many unacknowledged messages query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' severity: warning for: 1m - - name: Rabbitmq too much connections + - name: Rabbitmq too many connections description: The total connections of a node is too high query: 'rabbitmq_connections > 1000' severity: warning @@ -1247,22 +1247,22 @@ groups: rules: - name: HAProxy high HTTP 4xx error rate backend description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + query: ((sum by 
(proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 4xx error rate server description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} - query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 + query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 severity: critical for: 1m - name: HAProxy server response errors @@ -1279,33 +1279,29 @@ groups: description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be to high. 
query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 severity: critical - - name: HAProxy backend max active session - description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). - query: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80 + - name: HAProxy backend max active session > 80% + description: Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}% + query: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80 severity: warning for: 2m - name: HAProxy pending requests - description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0 severity: warning for: 2m - name: HAProxy HTTP slowing down - description: Average request time is increasing + description: Average request time is increasing - {{ $value | printf "%.2f"}} query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1 severity: warning for: 1m - name: HAProxy retry high - description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend + description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 severity: warning for: 2m - - name: HAProxy proxy down - description: HAProxy proxy is down - query: haproxy_backend_up == 0 - severity: critical - - name: HAProxy server down - description: HAProxy backend is down - query: haproxy_backend_active_servers == 0 + - name: HAProxy has no alive backends + description: HAProxy has no alive active or backup backends for {{ $labels.proxy }} + query: haproxy_backend_active_servers +
haproxy_backend_backup_servers == 0 severity: critical - name: HAProxy frontend security blocked requests description: HAProxy is blocking requests for security reason @@ -1441,6 +1437,10 @@ groups: - name: bakins/php-fpm-exporter doc_url: https://github.com/bakins/php-fpm-exporter rules: + - name: PHP-FPM max-children reached + description: PHP-FPM reached max children - {{ $labels.instance }} + query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0' + severity: warning - name: JVM exporters: @@ -2004,3 +2004,29 @@ groups: description: Thanos compaction has not run in 24 hours. query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' severity: critical + - name: Loki + exporters: + - rules: + - name: Loki process too many restarts + description: A loki process had too many restarts (target {{ $labels.instance }}) + query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2 + severity: warning + - name: Cortex + exporters: + - rules: + - name: Cortex ruler configuration reload failure + description: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) + query: cortex_ruler_config_last_reload_successful != 1 + severity: warning + - name: Cortex not connected to Alertmanager + description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) + query: cortex_prometheus_notifications_alertmanagers_discovered < 1 + severity: critical + - name: Cortex notifications are being dropped + description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0 + severity: critical + - name: Cortex notification error + description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0 + severity: critical diff --git a/alertmanager.md b/alertmanager.md index c7f8add..849a08f 100644 --- a/alertmanager.md +++
b/alertmanager.md @@ -92,7 +92,7 @@ receivers: text: "{{ range .Alerts }} {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}" - name: "pager" - webhook_config: + webhook_configs: - url: http://a.b.c.d:8080/send/sms send_resolved: true