diff --git a/dist/rules/haproxy/embedded-exporter-v2.yml b/dist/rules/haproxy/embedded-exporter-v2.yml index 4814124..7b75261 100644 --- a/dist/rules/haproxy/embedded-exporter-v2.yml +++ b/dist/rules/haproxy/embedded-exporter-v2.yml @@ -1,131 +1,129 @@ groups: + - name: EmbeddedExporterV2 -- name: EmbeddedExporterV2 + rules: + - alert: HaproxyHighHttp4xxErrorRateBackend + expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - rules: + - alert: HaproxyHighHttp5xxErrorRateBackend + expr: '((sum by (proxy, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp4xxErrorRateBackend - expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyHighHttp4xxErrorRateServer + expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp5xxErrorRateBackend - expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyHighHttp5xxErrorRateServer + expr: '((sum by (server, instance) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }}) + description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp4xxErrorRateServer - expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyServerResponseErrors + expr: "(sum by (server, instance) (rate(haproxy_server_response_errors_total[1m])) / sum by (server, instance) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5" + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy server response errors (instance {{ $labels.instance }}) + description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHighHttp5xxErrorRateServer - expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }}) - description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyBackendConnectionErrors + expr: "(sum by (proxy, instance) (rate(haproxy_backend_connection_errors_total[1m]))) > 100" + for: 1m + labels: + severity: critical + annotations: + summary: HAProxy backend connection errors (instance {{ $labels.instance }}) + description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyServerResponseErrors - expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy server response errors (instance {{ $labels.instance }}) - description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyServerConnectionErrors + expr: "(sum by (proxy, instance) (rate(haproxy_server_connection_errors_total[1m]))) > 100" + for: 0m + labels: + severity: critical + annotations: + summary: HAProxy server connection errors (instance {{ $labels.instance }}) + description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyBackendConnectionErrors - expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100' - for: 1m - labels: - severity: critical - annotations: - summary: HAProxy backend connection errors (instance {{ $labels.instance }}) - description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyBackendMaxActiveSession>80% + expr: "((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80" + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }}) + description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyServerConnectionErrors - expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100' - for: 0m - labels: - severity: critical - annotations: - summary: HAProxy server connection errors (instance {{ $labels.instance }}) - description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyPendingRequests + expr: "sum by (proxy, instance) (rate(haproxy_backend_current_queue[2m])) > 0" + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy pending requests (instance {{ $labels.instance }}) + description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyBackendMaxActiveSession>80% - expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80' - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }}) - description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyHttpSlowingDown + expr: "avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1" + for: 1m + labels: + severity: warning + annotations: + summary: HAProxy HTTP slowing down (instance {{ $labels.instance }}) + description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyPendingRequests - expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0' - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy pending requests (instance {{ $labels.instance }}) - description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyRetryHigh + expr: "sum by (proxy, instance) (rate(haproxy_backend_retry_warnings_total[1m])) > 10" + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy retry high (instance {{ $labels.instance }}) + description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHttpSlowingDown - expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1' - for: 1m - labels: - severity: warning - annotations: - summary: HAProxy HTTP slowing down (instance {{ $labels.instance }}) - description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyHasNoAliveBackends + expr: "haproxy_backend_active_servers + haproxy_backend_backup_servers == 0" + for: 0m + labels: + severity: critical + annotations: + summary: HAproxy has no alive backends (instance {{ $labels.instance }}) + description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyRetryHigh - expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy retry high (instance {{ $labels.instance }}) - description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyFrontendSecurityBlockedRequests + expr: "sum by (proxy, instance) (rate(haproxy_frontend_denied_connections_total[2m])) > 10" + for: 2m + labels: + severity: warning + annotations: + summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }}) + description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HaproxyHasNoAliveBackends - expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0' - for: 0m - labels: - severity: critical - annotations: - summary: HAproxy has no alive backends (instance {{ $labels.instance }}) - description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: HaproxyFrontendSecurityBlockedRequests - expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10' - for: 2m - labels: - severity: warning - annotations: - summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }}) - description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: HaproxyServerHealthcheckFailure - expr: 'increase(haproxy_server_check_failures_total[1m]) > 0' - for: 1m - labels: - severity: warning - annotations: - summary: HAProxy server healthcheck failure (instance {{ $labels.instance }}) - description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HaproxyServerHealthcheckFailure + expr: "increase(haproxy_server_check_failures_total[1m]) > 0" + for: 1m + labels: + severity: warning + annotations: + summary: HAProxy server healthcheck failure (instance {{ $labels.instance }}) + description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"