# Mirror of https://github.com/samber/awesome-prometheus-alerts.git
# Synced 2026-06-22 01:17:19 +08:00. Upstream file as mirrored: 65 lines, 3.2 KiB, YAML.
# Prometheus alerting rules for GitLab Gitaly.
#
# Every rule's annotations reference {{ $labels.instance }}, so each
# expression must keep the `instance` label on its result. Rules that
# aggregate therefore do so `by (instance)`; a bare `sum(...)` would drop all
# labels and leave the instance blank in the rendered alert.
groups:
  - name: Gitaly
    rules:
      # Percentage of gRPC calls handled by a Gitaly instance over the last
      # 5 minutes that finished with a code other than OK. Fires above 5%.
      - alert: GitlabGitalyHighGrpcErrorRate
        expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly high gRPC error rate (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
      # concurrency limits. This directly impacts users trying to push, pull, or clone.
      # This alert is derived from the GitLab Omnibus default rules.
      # Percentage of handled gRPC calls per instance that ended with
      # ResourceExhausted over the last 5 minutes. Fires above 1%.
      - alert: GitlabGitalyResourceExhausted
        expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: GitLab Gitaly resource exhausted (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # 95th percentile of unary gRPC handling time per instance, estimated
      # from the histogram buckets over the last 5 minutes. Fires above 1 second.
      # `le` must stay in the grouping for histogram_quantile to work.
      - alert: GitlabGitalyHighRpcLatency
        expr: 'histogram_quantile(0.95, sum by (le, instance) (rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m]))) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Fires when the throttled-seconds counter keeps increasing (5-minute
      # rate above zero) for 5 minutes. No aggregation, so labels are preserved.
      - alert: GitlabGitalyCpuThrottled
        expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly CPU throttled (instance {{ $labels.instance }})
          description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) on any increase in failed authentications
      # within the last 5 minutes.
      - alert: GitlabGitalyAuthenticationFailures
        expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly authentication failures (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

      # When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
      # Check Gitaly service health and logs.
      # Fires immediately (for: 0m) on any transition to "open" within the
      # last 5 minutes.
      - alert: GitlabGitalyCircuitBreakerTripped
        expr: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: GitLab Gitaly circuit breaker tripped (instance {{ $labels.instance }})
          description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"