HAProxy: add the instance label to the `sum by` aggregations so the label is kept and can be used in the alert summary

This commit is contained in:
Matej Zerovnik 2024-03-14 15:34:37 +01:00
parent 693c9e51b2
commit 7e8899f6a4

View file

@ -1,11 +1,9 @@
groups: groups:
- name: EmbeddedExporterV2
- name: EmbeddedExporterV2
rules: rules:
- alert: HaproxyHighHttp4xxErrorRateBackend - alert: HaproxyHighHttp4xxErrorRateBackend
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -14,7 +12,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend - alert: HaproxyHighHttp5xxErrorRateBackend
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -23,7 +21,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer - alert: HaproxyHighHttp4xxErrorRateServer
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -32,7 +30,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateServer - alert: HaproxyHighHttp5xxErrorRateServer
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -41,7 +39,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerResponseErrors - alert: HaproxyServerResponseErrors
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5' expr: "(sum by (server, instance) (rate(haproxy_server_response_errors_total[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5"
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -50,7 +48,7 @@ groups:
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendConnectionErrors - alert: HaproxyBackendConnectionErrors
expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100' expr: "(sum by (proxy, instance) (rate(haproxy_backend_connection_errors_total[1m]))) > 100"
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -59,7 +57,7 @@ groups:
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors - alert: HaproxyServerConnectionErrors
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100' expr: "(sum by (proxy, instance) (rate(haproxy_server_connection_errors_total[1m]))) > 100"
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -68,7 +66,7 @@ groups:
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession>80% - alert: HaproxyBackendMaxActiveSession>80%
expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80' expr: "((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80"
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -77,7 +75,7 @@ groups:
description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyPendingRequests - alert: HaproxyPendingRequests
expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0' expr: "sum by (proxy, instance) (rate(haproxy_backend_current_queue[2m])) > 0"
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -86,7 +84,7 @@ groups:
description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHttpSlowingDown - alert: HaproxyHttpSlowingDown
expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1' expr: "avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1"
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
@ -95,7 +93,7 @@ groups:
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyRetryHigh - alert: HaproxyRetryHigh
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' expr: "sum by (proxy, instance) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -104,7 +102,7 @@ groups:
description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHasNoAliveBackends - alert: HaproxyHasNoAliveBackends
expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0' expr: "haproxy_backend_active_servers + haproxy_backend_backup_servers == 0"
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -113,7 +111,7 @@ groups:
description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyFrontendSecurityBlockedRequests - alert: HaproxyFrontendSecurityBlockedRequests
expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10' expr: "sum by (proxy, instance) (rate(haproxy_frontend_denied_connections_total[2m])) > 10"
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -122,7 +120,7 @@ groups:
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure - alert: HaproxyServerHealthcheckFailure
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' expr: "increase(haproxy_server_check_failures_total[1m]) > 0"
for: 1m for: 1m
labels: labels:
severity: warning severity: warning