Merge branch 'samber:master' into master

2026-06-26 19:37:27 +08:00 · 2021-07-05 09:36:44 +03:00 · 2021-07-05 09:36:44 +03:00 · 64b38460d5
commit 64b38460d5
parent f969f31893 c2b8178304
4 changed files with 51 additions and 23 deletions
--- a/Gemfile.lock
+++ b/Gemfile.lock
@ -197,14 +197,14 @@ GEM
      rb-fsevent (~> 0.10, >= 0.10.3)
      rb-inotify (~> 0.9, >= 0.9.10)
    mercenary (0.3.6)
-    mini_portile2 (2.5.0)
+    mini_portile2 (2.5.1)
    minima (2.5.1)
      jekyll (>= 3.5, < 5.0)
      jekyll-feed (~> 0.9)
      jekyll-seo-tag (~> 2.1)
    minitest (5.14.1)
    multipart-post (2.1.1)
-    nokogiri (1.11.1)
+    nokogiri (1.11.4)
      mini_portile2 (~> 2.5.0)
      racc (~> 1.4)
    octokit (4.16.0)
--- a/README.md
+++ b/README.md
@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
 #### Other

 - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos)
+- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
+- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)

 ## 🤝 Contributing

--- a/_data/rules.yml
+++ b/_data/rules.yml
@ -836,12 +836,12 @@ groups:
                query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
                severity: warning
                for: 2m
-              - name: Rabbitmq too much unack
-                description: Too much unacknowledged messages
+              - name: Rabbitmq too many unack messages
+                description: Too many unacknowledged messages
                query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
                severity: warning
                for: 1m
-              - name: Rabbitmq too much connections
+              - name: Rabbitmq too many connections
                description: The total connections of a node is too high
                query: 'rabbitmq_connections > 1000'
                severity: warning
@ -1247,22 +1247,22 @@ groups:
            rules:
              - name: HAProxy high HTTP 4xx error rate backend
                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
-                query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate backend
                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}
-                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
+                query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 4xx error rate server
                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
-                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
+                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
                severity: critical
                for: 1m
              - name: HAProxy high HTTP 5xx error rate server
                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
-                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
+                query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
                severity: critical
                for: 1m
              - name: HAProxy server response errors
@ -1279,33 +1279,29 @@ groups:
                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be to high.
                query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100
                severity: critical
-              - name: HAProxy backend max active session
-                description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
-                query: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80
+              - name: HAProxy backend max active session > 80%
+                description: Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf "%.2f"}}%
+                query: ((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80
                severity: warning
                for: 2m
              - name: HAProxy pending requests
-                description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
+                description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0
                severity: warning
                for: 2m
              - name: HAProxy HTTP slowing down
-                description: Average request time is increasing
+                description: Average request time is increasing - {{ $value | printf "%.2f"}}
                query: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1
                severity: warning
                for: 1m
              - name: HAProxy retry high
-                description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
+                description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
                query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
                severity: warning
                for: 2m
-              - name: HAProxy proxy down
-                description: HAProxy proxy is down
-                query: haproxy_backend_up == 0
-                severity: critical
-              - name: HAProxy server down
-                description: HAProxy backend is down
-                query: haproxy_backend_active_servers == 0
+              - name: HAProxy has no alive backends
+                description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
+                query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
                severity: critical
              - name: HAProxy frontend security blocked requests
                description: HAProxy is blocking requests for security reason
@ -1441,6 +1437,10 @@ groups:
          - name: bakins/php-fpm-exporter
            doc_url: https://github.com/bakins/php-fpm-exporter
            rules:
+              - name: PHP-FPM max-children reached
+                description: PHP-FPM reached max children - {{ $labels.instance }}
+                query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
+                severity: warning

      - name: JVM
        exporters:
@ -2004,3 +2004,29 @@ groups:
                description: Thanos compaction has not run in 24 hours.
                query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
                severity: critical
+      - name: Loki
+        exporters:
+          - rules:
+              - name: Loki process too many restarts
+                description: A loki process had too many restarts (target {{ $labels.instance }})
+                query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
+                severity: warning
+      - name: Cortex
+        exporters:
+          - rules:
+              - name: Cortex ruler configuration reload failure
+                description: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
+                query: cortex_ruler_config_last_reload_successful != 1
+                severity: warning
+              - name: Cortex not connected to Alertmanager
+                description: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
+                query: cortex_prometheus_notifications_alertmanagers_discovered < 1
+                severity: critical
+              - name: Cortex notifications are being dropped
+                description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }})
+                query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0
+                severity: critical
+              - name: Cortex notification error
+                description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }})
+                query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0
+                severity: critical
--- a/alertmanager.md
+++ b/alertmanager.md
@ -92,7 +92,7 @@ receivers:
        text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"

  - name: "pager"
-    webhook_config:
+    webhook_configs:
      - url: http://a.b.c.d:8080/send/sms
        send_resolved: true