mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
Merge branch 'samber:master' into master
This commit is contained in:
commit
64b38460d5
4 changed files with 51 additions and 23 deletions
|
|
@ -197,14 +197,14 @@ GEM
|
|||
rb-fsevent (~> 0.10, >= 0.10.3)
|
||||
rb-inotify (~> 0.9, >= 0.9.10)
|
||||
mercenary (0.3.6)
|
||||
mini_portile2 (2.5.0)
|
||||
mini_portile2 (2.5.1)
|
||||
minima (2.5.1)
|
||||
jekyll (>= 3.5, < 5.0)
|
||||
jekyll-feed (~> 0.9)
|
||||
jekyll-seo-tag (~> 2.1)
|
||||
minitest (5.14.1)
|
||||
multipart-post (2.1.1)
|
||||
nokogiri (1.11.1)
|
||||
nokogiri (1.11.4)
|
||||
mini_portile2 (~> 2.5.0)
|
||||
racc (~> 1.4)
|
||||
octokit (4.16.0)
|
||||
|
|
|
|||
|
|
@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
|
|||
#### Other
|
||||
|
||||
- [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos)
|
||||
- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
|
||||
- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)
|
||||
|
||||
## 🤝 Contributing
|
||||
|
||||
|
|
|
|||
|
|
@ -836,12 +836,12 @@ groups:
|
|||
query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
|
||||
severity: warning
|
||||
for: 2m
|
||||
- name: Rabbitmq too much unack
|
||||
description: Too much unacknowledged messages
|
||||
- name: Rabbitmq too many unack messages
|
||||
description: Too many unacknowledged messages
|
||||
query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
|
||||
severity: warning
|
||||
for: 1m
|
||||
- name: Rabbitmq too much connections
|
||||
- name: Rabbitmq too many connections
|
||||
description: The total connections of a node is too high
|
||||
query: 'rabbitmq_connections > 1000'
|
||||
severity: warning
|
||||
|
|
@ -1247,22 +1247,22 @@ groups:
|
|||
rules:
|
||||
- name: HAProxy high HTTP 4xx error rate backend
|
||||
description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
|
||||
query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
|
||||
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
severity: critical
|
||||
for: 1m
|
||||
- name: HAProxy high HTTP 5xx error rate backend
|
||||
description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
|
||||
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
severity: critical
|
||||
for: 1m
|
||||
- name: HAProxy high HTTP 4xx error rate server
|
||||
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
|
||||
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
severity: critical
|
||||
for: 1m
|
||||
- name: HAProxy high HTTP 5xx error rate server
|
||||
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
|
||||
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
||||
severity: critical
|
||||
for: 1m
|
||||
- name: HAProxy server response errors
|
||||
|
|
@ -1279,33 +1279,29 @@ groups:
|
|||
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
|
||||
query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100
|
||||
severity: critical
|
||||
- name: HAProxy backend max active session
|
||||
description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
|
||||
query: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80
|
||||
- name: HAProxy backend max active session > 80%
|
||||
description: Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}%
|
||||
query: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80
|
||||
severity: warning
|
||||
for: 2m
|
||||
- name: HAProxy pending requests
|
||||
description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
|
||||
description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
|
||||
query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0
|
||||
severity: warning
|
||||
for: 2m
|
||||
- name: HAProxy HTTP slowing down
|
||||
description: Average request time is increasing
|
||||
description: Average request time is increasing - {{ $value | printf "%.2f"}}
|
||||
query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1
|
||||
severity: warning
|
||||
for: 1m
|
||||
- name: HAProxy retry high
|
||||
description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
|
||||
description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
|
||||
query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
|
||||
severity: warning
|
||||
for: 2m
|
||||
- name: HAProxy proxy down
|
||||
description: HAProxy proxy is down
|
||||
query: haproxy_backend_up == 0
|
||||
severity: critical
|
||||
- name: HAProxy server down
|
||||
description: HAProxy backend is down
|
||||
query: haproxy_backend_active_servers == 0
|
||||
- name: HAproxy has no alive backends
|
||||
description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
|
||||
query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
|
||||
severity: critical
|
||||
- name: HAProxy frontend security blocked requests
|
||||
description: HAProxy is blocking requests for security reason
|
||||
|
|
@ -1441,6 +1437,10 @@ groups:
|
|||
- name: bakins/php-fpm-exporter
|
||||
doc_url: https://github.com/bakins/php-fpm-exporter
|
||||
rules:
|
||||
- name: PHP-FPM max-children reached
|
||||
description: PHP-FPM reached max children - {{ $labels.instance }}
|
||||
query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
|
||||
severity: warning
|
||||
|
||||
- name: JVM
|
||||
exporters:
|
||||
|
|
@ -2004,3 +2004,29 @@ groups:
|
|||
description: Thanos compaction has not run in 24 hours.
|
||||
query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
|
||||
severity: critical
|
||||
- name: Loki
|
||||
exporters:
|
||||
- rules:
|
||||
- name: Loki process too many restarts
|
||||
description: A loki process had too many restarts (target {{ $labels.instance }})
|
||||
query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
|
||||
severity: warning
|
||||
- name: Cortex
|
||||
exporters:
|
||||
- rules:
|
||||
- name: Cortex ruler configuration reload failure
|
||||
description: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
|
||||
query: cortex_ruler_config_last_reload_successful != 1
|
||||
severity: warning
|
||||
- name: Cortex not connected to Alertmanager
|
||||
description: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
|
||||
query: cortex_prometheus_notifications_alertmanagers_discovered < 1
|
||||
severity: critical
|
||||
- name: Cortex notification are being dropped
|
||||
description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }})
|
||||
query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0
|
||||
severity: critical
|
||||
- name: Cortex notification error
|
||||
description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }})
|
||||
query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0
|
||||
severity: critical
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ receivers:
|
|||
text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
|
||||
|
||||
- name: "pager"
|
||||
webhook_config:
|
||||
webhook_configs:
|
||||
- url: http://a.b.c.d:8080/send/sms
|
||||
send_resolved: true
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue