Merge branch 'samber:master' into master

This commit is contained in:
gökhan 2021-07-05 09:36:44 +03:00 committed by GitHub
commit 64b38460d5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 51 additions and 23 deletions

View file

@ -197,14 +197,14 @@ GEM
rb-fsevent (~> 0.10, >= 0.10.3) rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10) rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.3.6) mercenary (0.3.6)
mini_portile2 (2.5.0) mini_portile2 (2.5.1)
minima (2.5.1) minima (2.5.1)
jekyll (>= 3.5, < 5.0) jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9) jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1) jekyll-seo-tag (~> 2.1)
minitest (5.14.1) minitest (5.14.1)
multipart-post (2.1.1) multipart-post (2.1.1)
nokogiri (1.11.1) nokogiri (1.11.4)
mini_portile2 (~> 2.5.0) mini_portile2 (~> 2.5.0)
racc (~> 1.4) racc (~> 1.4)
octokit (4.16.0) octokit (4.16.0)

View file

@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
#### Other #### Other
- [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos) - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos)
- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)
## 🤝 Contributing ## 🤝 Contributing

View file

@ -836,12 +836,12 @@ groups:
query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
severity: warning severity: warning
for: 2m for: 2m
- name: Rabbitmq too much unack - name: Rabbitmq too many unack messages
description: Too much unacknowledged messages description: Too many unacknowledged messages
query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
severity: warning severity: warning
for: 1m for: 1m
- name: Rabbitmq too much connections - name: Rabbitmq too many connections
description: The total connections of a node is too high description: The total connections of a node is too high
query: 'rabbitmq_connections > 1000' query: 'rabbitmq_connections > 1000'
severity: warning severity: warning
@ -1247,22 +1247,22 @@ groups:
rules: rules:
- name: HAProxy high HTTP 4xx error rate backend - name: HAProxy high HTTP 4xx error rate backend
description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
severity: critical severity: critical
for: 1m for: 1m
- name: HAProxy high HTTP 5xx error rate backend - name: HAProxy high HTTP 5xx error rate backend
description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }} description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
severity: critical severity: critical
for: 1m for: 1m
- name: HAProxy high HTTP 4xx error rate server - name: HAProxy high HTTP 4xx error rate server
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
severity: critical severity: critical
for: 1m for: 1m
- name: HAProxy high HTTP 5xx error rate server - name: HAProxy high HTTP 5xx error rate server
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
severity: critical severity: critical
for: 1m for: 1m
- name: HAProxy server response errors - name: HAProxy server response errors
@ -1279,33 +1279,29 @@ groups:
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100
severity: critical severity: critical
- name: HAProxy backend max active session - name: HAProxy backend max active session > 80%
description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). description: Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}%
query: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80 query: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80
severity: warning severity: warning
for: 2m for: 2m
- name: HAProxy pending requests - name: HAProxy pending requests
description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0 query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0
severity: warning severity: warning
for: 2m for: 2m
- name: HAProxy HTTP slowing down - name: HAProxy HTTP slowing down
description: Average request time is increasing description: Average request time is increasing - {{ $value | printf "%.2f"}}
query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1 query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1
severity: warning severity: warning
for: 1m for: 1m
- name: HAProxy retry high - name: HAProxy retry high
description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
severity: warning severity: warning
for: 2m for: 2m
- name: HAProxy proxy down - name: HAproxy has no alive backends
description: HAProxy proxy is down description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
query: haproxy_backend_up == 0 query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
severity: critical
- name: HAProxy server down
description: HAProxy backend is down
query: haproxy_backend_active_servers == 0
severity: critical severity: critical
- name: HAProxy frontend security blocked requests - name: HAProxy frontend security blocked requests
description: HAProxy is blocking requests for security reason description: HAProxy is blocking requests for security reason
@ -1441,6 +1437,10 @@ groups:
- name: bakins/php-fpm-exporter - name: bakins/php-fpm-exporter
doc_url: https://github.com/bakins/php-fpm-exporter doc_url: https://github.com/bakins/php-fpm-exporter
rules: rules:
- name: PHP-FPM max-children reached
description: PHP-FPM reached max children - {{ $labels.instance }}
query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
severity: warning
- name: JVM - name: JVM
exporters: exporters:
@ -2004,3 +2004,29 @@ groups:
description: Thanos compaction has not run in 24 hours. description: Thanos compaction has not run in 24 hours.
query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
severity: critical severity: critical
- name: Loki
exporters:
- rules:
- name: Loki process too many restarts
description: A loki process had too many restarts (target {{ $labels.instance }})
query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
severity: warning
- name: Cortex
exporters:
- rules:
- name: Cortex ruler configuration reload failure
description: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
query: cortex_ruler_config_last_reload_successful != 1
severity: warning
- name: Cortex not connected to Alertmanager
description: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
query: cortex_prometheus_notifications_alertmanagers_discovered < 1
severity: critical
- name: Cortex notifications are being dropped
description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }})
query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0
severity: critical
- name: Cortex notification error
description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }})
query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0
severity: critical

View file

@ -92,7 +92,7 @@ receivers:
text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}" text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
- name: "pager" - name: "pager"
webhook_config: webhook_configs:
- url: http://a.b.c.d:8080/send/sms - url: http://a.b.c.d:8080/send/sms
send_resolved: true send_resolved: true