HAProxy: add instance to sums so we keep the label and can use it in the summary

2026-06-26 19:37:27 +08:00 · 2024-03-14 15:34:37 +01:00 · 2024-03-14 15:34:37 +01:00 · 7e8899f6a4
commit 7e8899f6a4
parent 693c9e51b2
1 changed files with 114 additions and 116 deletions
--- a/dist/rules/haproxy/embedded-exporter-v2.yml
+++ b/dist/rules/haproxy/embedded-exporter-v2.yml
@@ -1,131 +1,129 @@
 groups:
  - name: EmbeddedExporterV2
- name: EmbeddedExporterV2
+    rules:
      - alert: HaproxyHighHttp4xxErrorRateBackend
        expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
          description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-  rules:
+      - alert: HaproxyHighHttp5xxErrorRateBackend
        expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
          description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHighHttp4xxErrorRateBackend
+      - alert: HaproxyHighHttp4xxErrorRateServer
-      expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+        expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
-      for: 1m
+        for: 1m
-      labels:
+        labels:
-        severity: critical
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
+          summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHighHttp5xxErrorRateBackend
+      - alert: HaproxyHighHttp5xxErrorRateServer
-      expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+        expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
-      for: 1m
+        for: 1m
-      labels:
+        labels:
-        severity: critical
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
+          summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHighHttp4xxErrorRateServer
+      - alert: HaproxyServerResponseErrors
-      expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+        expr: "(sum by (server, instance) (rate(haproxy_server_response_errors_total[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5"
-      for: 1m
+        for: 1m
-      labels:
+        labels:
-        severity: critical
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
+          summary: HAProxy server response errors (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Too many response errors to {{ $labels.server }} server (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHighHttp5xxErrorRateServer
+      - alert: HaproxyBackendConnectionErrors
-      expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+        expr: "(sum by (proxy, instance) (rate(haproxy_backend_connection_errors_total[1m]))) > 100"
-      for: 1m
+        for: 1m
-      labels:
+        labels:
-        severity: critical
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
+          summary: HAProxy backend connection errors (instance {{ $labels.instance }})
-        description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyServerResponseErrors
+      - alert: HaproxyServerConnectionErrors
-      expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
+        expr: "(sum by (proxy, instance) (rate(haproxy_server_connection_errors_total[1m]))) > 100"
-      for: 1m
+        for: 0m
-      labels:
+        labels:
-        severity: critical
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy server response errors (instance {{ $labels.instance }})
+          summary: HAProxy server connection errors (instance {{ $labels.instance }})
-        description: "Too many response errors to {{ $labels.server }} server (> 5%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyBackendConnectionErrors
+      - alert: HaproxyBackendMaxActiveSession>80%
-      expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100'
+        expr: "((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80"
-      for: 1m
+        for: 2m
-      labels:
+        labels:
-        severity: critical
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAProxy backend connection errors (instance {{ $labels.instance }})
+          summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
-        description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyServerConnectionErrors
+      - alert: HaproxyPendingRequests
-      expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
+        expr: "sum by (proxy, instance) (rate(haproxy_backend_current_queue[2m])) > 0"
-      for: 0m
+        for: 2m
-      labels:
+        labels:
-        severity: critical
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAProxy server connection errors (instance {{ $labels.instance }})
+          summary: HAProxy pending requests (instance {{ $labels.instance }})
-        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyBackendMaxActiveSession>80%
+      - alert: HaproxyHttpSlowingDown
-      expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80'
+        expr: "avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1"
-      for: 2m
+        for: 1m
-      labels:
+        labels:
-        severity: warning
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
+          summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
-        description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyPendingRequests
+      - alert: HaproxyRetryHigh
-      expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0'
+        expr: "sum by (proxy, instance) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
-      for: 2m
+        for: 2m
-      labels:
+        labels:
-        severity: warning
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAProxy pending requests (instance {{ $labels.instance }})
+          summary: HAProxy retry high (instance {{ $labels.instance }})
-        description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHttpSlowingDown
+      - alert: HaproxyHasNoAliveBackends
-      expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1'
+        expr: "haproxy_backend_active_servers + haproxy_backend_backup_servers == 0"
-      for: 1m
+        for: 0m
-      labels:
+        labels:
-        severity: warning
+          severity: critical
-      annotations:
+        annotations:
-        summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
+          summary: HAproxy has no alive backends (instance {{ $labels.instance }})
-        description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyRetryHigh
+      - alert: HaproxyFrontendSecurityBlockedRequests
-      expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
+        expr: "sum by (proxy, instance) (rate(haproxy_frontend_denied_connections_total[2m])) > 10"
-      for: 2m
+        for: 2m
-      labels:
+        labels:
-        severity: warning
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAProxy retry high (instance {{ $labels.instance }})
+          summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
-        description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HaproxyHasNoAliveBackends
+      - alert: HaproxyServerHealthcheckFailure
-      expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0'
+        expr: "increase(haproxy_server_check_failures_total[1m]) > 0"
-      for: 0m
+        for: 1m
-      labels:
+        labels:
-        severity: critical
+          severity: warning
-      annotations:
+        annotations:
-        summary: HAproxy has no alive backends (instance {{ $labels.instance }})
+          summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
-        description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+          description: "Some server healthcheck are failing on {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HaproxyFrontendSecurityBlockedRequests
      expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
        description: "HAProxy is blocking requests for security reason\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HaproxyServerHealthcheckFailure
      expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
        description: "Some server healthcheck are failing on {{ $labels.server }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"