From 1b7c36666cf4a3c3b27f3ff4955094994524585a Mon Sep 17 00:00:00 2001
From: Benjamin Dos Santos
Date: Tue, 5 Jan 2021 16:45:52 +0100
Subject: [PATCH] chore: add Prometheus alerts for HAProxy v2

Add alerting rules for the exporter embedded in HAProxy >= v2: HTTP
4xx/5xx error ratios per backend and per server, response and
connection errors, session saturation, queued requests, slow responses,
retries, proxy/server availability, denied connections and health-check
failures.

Ratio rules aggregate numerator and denominator by the same label
(`proxy` for backend rules, `server` for server rules) so that PromQL
vector matching yields a result, and descriptions only reference labels
that survive the aggregation.

Also switch the `haproxy_up` query of the legacy exporter to double
quotes.

ref #87
---
 _data/rules.yml | 74 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 73 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index c71dd23..9111b34 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1070,12 +1070,84 @@ groups:
           - name: Embedded exporter (HAProxy >= v2)
             doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter
             rules:
+              - name: HAProxy high HTTP 4xx error rate backend
+                description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}
+                query: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+                severity: critical
+                for: 1m
+              - name: HAProxy high HTTP 5xx error rate backend
+                description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}
+                query: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+                severity: critical
+                for: 1m
+              - name: HAProxy high HTTP 4xx error rate server
+                description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
+                query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+                severity: critical
+                for: 1m
+              - name: HAProxy high HTTP 5xx error rate server
+                description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
+                query: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
+                severity: critical
+                for: 1m
+              - name: HAProxy server response errors
+                description: Too many response errors to {{ $labels.server }} server (> 5%).
+                query: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
+                severity: critical
+                for: 1m
+              - name: HAProxy backend connection errors
+                description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.
+                query: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100'
+                severity: critical
+                for: 1m
+              - name: HAProxy server connection errors
+                description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
+                query: '(sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
+                severity: critical
+              - name: HAProxy backend max active session
+                description: HAProxy backend {{ $labels.proxy }} is reaching session limit (> 80%).
+                query: 'avg_over_time((sum by (proxy) (haproxy_server_max_sessions) / sum by (proxy) (haproxy_server_limit_sessions))[2m:]) * 100 > 80'
+                severity: warning
+                for: 2m
+              - name: HAProxy pending requests
+                description: Some HAProxy requests are pending on {{ $labels.proxy }} backend
+                query: 'sum by (proxy) (haproxy_backend_current_queue) > 0'
+                severity: warning
+                for: 2m
+              - name: HAProxy HTTP slowing down
+                description: Average request time is increasing
+                query: 'avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1'
+                severity: warning
+                for: 1m
+              - name: HAProxy retry high
+                description: High rate of retry on {{ $labels.proxy }} backend
+                query: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
+                severity: warning
+                for: 2m
+              - name: HAProxy proxy down
+                description: HAProxy proxy is down
+                query: 'haproxy_backend_up == 0'
+                severity: critical
+              - name: HAProxy server down
+                description: HAProxy backend is down
+                query: 'haproxy_backend_active_servers == 0'
+                severity: critical
+              - name: HAProxy frontend security blocked requests
+                description: HAProxy is blocking requests for security reasons
+                query: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10'
+                severity: warning
+                for: 2m
+              - name: HAProxy server healthcheck failure
+                description: Some server healthchecks are failing on {{ $labels.server }}
+                query: 'increase(haproxy_server_check_failures_total[1m]) > 0'
+                severity: warning
+                for: 1m
           - name: prometheus/haproxy_exporter (HAProxy < v2)
             doc_url: https://github.com/prometheus/haproxy_exporter
             rules:
               - name: HAProxy down
                 description: HAProxy down
-                query: 'haproxy_up == 0'
+                query: "haproxy_up == 0"
                 severity: critical
               - name: HAProxy high HTTP 4xx error rate backend
                 description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}