mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Publish
This commit is contained in:
parent
a4581ed322
commit
4fb1aa9ae4
29 changed files with 72 additions and 57 deletions
|
|
@ -33,9 +33,10 @@ groups:
|
|||
summary: Flink all task slots used (instance {{ $labels.instance }})
|
||||
description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
|
||||
- alert: FlinkJobRestartIncreasing
|
||||
expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 0'
|
||||
for: 0m
|
||||
expr: 'increase(flink_jobmanager_job_numRestarts[5m]) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
|
@ -43,14 +44,15 @@ groups:
|
|||
description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FlinkCheckpointFailures
|
||||
expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 0'
|
||||
for: 0m
|
||||
expr: 'increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink checkpoint failures (instance {{ $labels.instance }})
|
||||
description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Value is in milliseconds. humanizeDuration expects seconds, so the template output may be misleading.
|
||||
# Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
|
||||
- alert: FlinkCheckpointDurationHigh
|
||||
expr: 'flink_jobmanager_job_lastCheckpointDuration > 60000'
|
||||
|
|
|
|||
9
dist/rules/apache-spark/spark-prometheus.yml
vendored
9
dist/rules/apache-spark/spark-prometheus.yml
vendored
|
|
@ -51,7 +51,7 @@ groups:
|
|||
# Fires when more than 10% of executor time is spent in garbage collection.
|
||||
# This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
|
||||
- alert: SparkExecutorHighGcTime
|
||||
expr: 'metrics_executor_totalGCTime / (metrics_executor_totalDuration > 0) > 0.1'
|
||||
expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration > 0) > 0.1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -60,7 +60,7 @@ groups:
|
|||
description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SparkExecutorAllTasksFailing
|
||||
expr: 'metrics_executor_failedTasks > 0 and metrics_executor_completedTasks == 0'
|
||||
expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -69,7 +69,7 @@ groups:
|
|||
description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SparkExecutorHighTaskFailureRate
|
||||
expr: 'metrics_executor_failedTasks / (metrics_executor_totalTasks > 0) > 0.1'
|
||||
expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -77,9 +77,10 @@ groups:
|
|||
summary: Spark executor high task failure rate (instance {{ $labels.instance }})
|
||||
description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
|
||||
# Disk spilling indicates insufficient memory for the workload.
|
||||
- alert: SparkExecutorHighDiskSpill
|
||||
expr: 'rate(metrics_executor_diskUsed_bytes[5m]) > 0'
|
||||
expr: 'metrics_executor_diskUsed_bytes > 1e9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
|
|
@ -112,7 +112,7 @@ groups:
|
|||
|
||||
# Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
|
||||
- alert: AwsAlbHigh5xxErrorRate
|
||||
expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5'
|
||||
expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -132,7 +132,7 @@ groups:
|
|||
|
||||
# Requires Lambda Errors and Invocations metrics.
|
||||
- alert: AwsLambdaHighErrorRate
|
||||
expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5'
|
||||
expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ groups:
|
|||
summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
|
||||
description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# In cert-manager 1.19+, the metric was renamed (dropped http_ prefix). Verify metric name against your version.
|
||||
- alert: Cert-managerHittingAcmeRateLimits
|
||||
expr: 'sum by (host) (rate(certmanager_http_acme_client_request_count{status="429"}[5m])) > 0'
|
||||
for: 5m
|
||||
|
|
|
|||
4
dist/rules/cilium/embedded-exporter.yml
vendored
4
dist/rules/cilium/embedded-exporter.yml
vendored
|
|
@ -5,6 +5,7 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
|
||||
- alert: CiliumAgentUnreachableNodes
|
||||
expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
|
||||
for: 15m
|
||||
|
|
@ -14,6 +15,7 @@ groups:
|
|||
summary: Cilium agent unreachable nodes (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
|
||||
- alert: CiliumAgentUnreachableHealthEndpoints
|
||||
expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
|
||||
for: 15m
|
||||
|
|
@ -23,6 +25,7 @@ groups:
|
|||
summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
|
||||
- alert: CiliumAgentFailingControllers
|
||||
expr: 'sum(cilium_controllers_failing{}) by (pod) > 0'
|
||||
for: 5m
|
||||
|
|
@ -198,6 +201,7 @@ groups:
|
|||
summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }})
|
||||
description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
|
||||
- alert: CiliumOperatorIpamInterfaceCreationFailures
|
||||
expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
|
||||
for: 10m
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ groups:
|
|||
|
||||
- alert: DigitaloceanAccountNotActive
|
||||
expr: 'digitalocean_account_active != 1'
|
||||
for: 0m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -52,7 +52,7 @@ groups:
|
|||
|
||||
- alert: DigitaloceanLoadBalancerNoBackends
|
||||
expr: 'digitalocean_loadbalancer_droplets == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
|
@ -79,7 +79,7 @@ groups:
|
|||
|
||||
- alert: DigitaloceanExporterCollectionErrors
|
||||
expr: 'increase(digitalocean_errors_total[5m]) > 0'
|
||||
for: 0m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
|
|
|||
6
dist/rules/ebpf/ebpf-exporter.yml
vendored
6
dist/rules/ebpf/ebpf-exporter.yml
vendored
|
|
@ -13,7 +13,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: eBPF exporter program not attached (instance {{ $labels.instance }})
|
||||
description: "eBPF program {{ $labels.name }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EbpfExporterDecoderErrors
|
||||
expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0'
|
||||
|
|
@ -22,10 +22,10 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
|
||||
description: "eBPF exporter is experiencing decoder errors for program {{ $labels.name }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EbpfExporterNoEnabledConfigs
|
||||
expr: 'ebpf_exporter_enabled_configs == 0'
|
||||
expr: 'absent(ebpf_exporter_enabled_configs)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
4
dist/rules/envoy/embedded-exporter.yml
vendored
4
dist/rules/envoy/embedded-exporter.yml
vendored
|
|
@ -24,7 +24,7 @@ groups:
|
|||
description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighDownstreamHttp5xxErrorRate
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5'
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -33,7 +33,7 @@ groups:
|
|||
description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighDownstreamHttp4xxErrorRate
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10'
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
5
dist/rules/gitlab-ci/gitaly.yml
vendored
5
dist/rules/gitlab-ci/gitaly.yml
vendored
|
|
@ -5,8 +5,9 @@ groups:
|
|||
|
||||
rules:
|
||||
|
||||
# grpc_code!="OK" includes non-error codes like NotFound, AlreadyExists. Consider filtering to specific error codes for less noise.
|
||||
- alert: GitlabGitalyHighGrpcErrorRate
|
||||
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5'
|
||||
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -18,7 +19,7 @@ groups:
|
|||
# concurrency limits. This directly impacts users trying to push, pull, or clone.
|
||||
# This alert is derived from the GitLab Omnibus default rules.
|
||||
- alert: GitlabGitalyResourceExhausted
|
||||
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1'
|
||||
expr: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
|
|
@ -37,7 +37,7 @@ groups:
|
|||
# Threshold is 5% of all requests returning server errors.
|
||||
# Check GitLab logs at /var/log/gitlab/ for root cause.
|
||||
- alert: GitlabHighHttpErrorRate
|
||||
expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5'
|
||||
expr: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5 and sum(rate(http_requests_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -58,7 +58,7 @@ groups:
|
|||
# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
|
||||
# A sustained failure rate indicates background processing issues.
|
||||
- alert: GitlabSidekiqJobsFailing
|
||||
expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0'
|
||||
expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -136,6 +136,7 @@ groups:
|
|||
summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
|
||||
description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This metric may not exist in all GitLab versions. Verify against your GitLab installation.
|
||||
- alert: GitlabCiPipelineFailuresIncreasing
|
||||
expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0'
|
||||
for: 10m
|
||||
|
|
@ -188,7 +189,7 @@ groups:
|
|||
|
||||
# This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
|
||||
- alert: GitlabVersionMismatch
|
||||
expr: 'count(count by (version) (deployments{version!=""})) > 1'
|
||||
expr: 'count(count by (version) (gitlab_build_info)) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
2
dist/rules/gitlab-ci/workhorse.yml
vendored
2
dist/rules/gitlab-ci/workhorse.yml
vendored
|
|
@ -8,7 +8,7 @@ groups:
|
|||
# Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
|
||||
# Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
|
||||
- alert: GitlabWorkhorseHighErrorRate
|
||||
expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10'
|
||||
expr: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
4
dist/rules/golang/golang-exporter.yml
vendored
4
dist/rules/golang/golang-exporter.yml
vendored
|
|
@ -39,13 +39,13 @@ groups:
|
|||
|
||||
# Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
|
||||
- alert: GoThreadCountHigh
|
||||
expr: 'go_threads > 50'
|
||||
expr: 'go_threads > 500'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go thread count high (instance {{ $labels.instance }})
|
||||
description: "Go OS thread count is high (> 50), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is a rough default. Adjust based on your application's normal object count.
|
||||
- alert: GoHeapObjectsCountHigh
|
||||
|
|
|
|||
|
|
@ -6,10 +6,10 @@ groups:
|
|||
rules:
|
||||
|
||||
- alert: GrafanaAlloyServiceDown
|
||||
expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) '
|
||||
expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Grafana Alloy service down (instance {{ $labels.instance }})
|
||||
description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -231,8 +231,9 @@ groups:
|
|||
summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold aligned with official Mimir mixin (30 minutes).
|
||||
- alert: MimirStoreGatewayHasNotSyncedBucket
|
||||
expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 600) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
|
||||
expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -277,7 +278,7 @@ groups:
|
|||
description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCompactorHasConsecutiveFailures
|
||||
expr: 'increase(cortex_compactor_runs_failed_total[2h]) > 1'
|
||||
expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -303,8 +304,9 @@ groups:
|
|||
summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Using 24h window per official mixin — compaction skips are rare events.
|
||||
- alert: MimirCompactorSkippedBlocks
|
||||
expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0'
|
||||
expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
|
|
@ -117,6 +117,7 @@ groups:
|
|||
summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
|
||||
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
|
||||
- alert: TempoCompactionTooManyOutstandingBlocksCritical
|
||||
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
|
||||
for: 24h
|
||||
|
|
|
|||
10
dist/rules/graph-node/embedded-exporter.yml
vendored
10
dist/rules/graph-node/embedded-exporter.yml
vendored
|
|
@ -41,20 +41,20 @@ groups:
|
|||
summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
|
||||
description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StoreConnectionIsTooSlow
|
||||
- alert: StoreConnectionSlow
|
||||
expr: 'store_connection_wait_time_ms > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||
summary: Store connection slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StoreConnectionIsTooSlow
|
||||
- alert: StoreConnectionVerySlow
|
||||
expr: 'store_connection_wait_time_ms > 20'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Store connection very slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
2
dist/rules/ipmi/ipmi-exporter.yml
vendored
2
dist/rules/ipmi/ipmi-exporter.yml
vendored
|
|
@ -118,7 +118,7 @@ groups:
|
|||
# Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
|
||||
- alert: IpmiGenericSensorCritical
|
||||
expr: 'ipmi_sensor_state == 2'
|
||||
for: 0m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
|
|
@ -56,9 +56,9 @@ groups:
|
|||
summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
|
||||
description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 2 seconds is a rough default. Adjust based on your performance requirements.
|
||||
# keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.
|
||||
- alert: KeycloakSlowRequestResponseTime
|
||||
expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
|
||||
expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
2
dist/rules/linkerd/embedded-exporter.yml
vendored
2
dist/rules/linkerd/embedded-exporter.yml
vendored
|
|
@ -12,4 +12,4 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Linkerd high error rate (instance {{ $labels.instance }})
|
||||
description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
2
dist/rules/openstack/openstack-exporter.yml
vendored
2
dist/rules/openstack/openstack-exporter.yml
vendored
|
|
@ -24,7 +24,7 @@ groups:
|
|||
description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronAgentDown
|
||||
expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
|
||||
expr: 'openstack_neutron_agent_state{adminState="up"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
|
|
@ -65,7 +65,7 @@ groups:
|
|||
|
||||
# Threshold is highly workload-dependent. Adjust 200 to suit your environment.
|
||||
- alert: OracleDbTooManyActiveSessions
|
||||
expr: 'oracledb_sessions_activity{status="ACTIVE", type="USER"} > 200'
|
||||
expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -73,10 +73,9 @@ groups:
|
|||
summary: Oracle DB too many active sessions (instance {{ $labels.instance }})
|
||||
description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# High user I/O wait time indicates storage performance issues (slow disks, SAN latency, etc.).
|
||||
# The metric is in centiseconds per second. Threshold 300 means 3 seconds of I/O wait per second of wall time.
|
||||
# The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
|
||||
- alert: OracleDbHighWaitTime(userI/o)
|
||||
expr: 'rate(oracledb_wait_time_user_io[5m]) > 300'
|
||||
expr: 'oracledb_wait_time_user_io > 300'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
10
dist/rules/process-exporter/process-exporter.yml
vendored
10
dist/rules/process-exporter/process-exporter.yml
vendored
|
|
@ -7,9 +7,9 @@ groups:
|
|||
|
||||
- alert: ProcessExporterGroupDown
|
||||
expr: 'namedprocess_namegroup_num_procs == 0'
|
||||
for: 2m
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Process exporter group down (instance {{ $labels.instance }})
|
||||
description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -63,7 +63,7 @@ groups:
|
|||
description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ProcessExporterZombieProcesses
|
||||
expr: 'namedprocess_namegroup_states{state="Zombie"} > 0'
|
||||
expr: 'namedprocess_namegroup_states{state="Zombie"} > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -71,9 +71,9 @@ groups:
|
|||
summary: Process exporter zombie processes (instance {{ $labels.instance }})
|
||||
description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
|
||||
# Filters to voluntary switches only — involuntary switches are normal under CPU contention. Threshold of 50000/s is a rough default. Adjust based on workload.
|
||||
- alert: ProcessExporterHighContextSwitching
|
||||
expr: 'rate(namedprocess_namegroup_context_switches_total[5m]) > 10000'
|
||||
expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
|
|
@ -36,7 +36,7 @@ groups:
|
|||
description: "Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf \"%.2f\" }}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PveHighMemoryUsage
|
||||
expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90'
|
||||
expr: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
1
dist/rules/python/python-exporter.yml
vendored
1
dist/rules/python/python-exporter.yml
vendored
|
|
@ -33,6 +33,7 @@ groups:
|
|||
summary: Python file descriptors exhaustion (instance {{ $labels.instance }})
|
||||
description: "Python process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload.
|
||||
- alert: PythonGcGeneration2CollectionsHigh
|
||||
expr: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1'
|
||||
for: 5m
|
||||
|
|
|
|||
1
dist/rules/ruby/ruby-exporter.yml
vendored
1
dist/rules/ruby/ruby-exporter.yml
vendored
|
|
@ -24,6 +24,7 @@ groups:
|
|||
summary: Ruby heap free slots high (instance {{ $labels.instance }})
|
||||
description: "Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Major GC rate > 5/s is extremely high. Consider lowering to > 1 or > 2 for earlier detection.
|
||||
- alert: RubyMajorGcRateHigh
|
||||
expr: 'rate(ruby_major_gc_ops_total[5m]) > 5'
|
||||
for: 5m
|
||||
|
|
|
|||
4
dist/rules/snmp/snmp-exporter.yml
vendored
4
dist/rules/snmp/snmp-exporter.yml
vendored
|
|
@ -46,7 +46,7 @@ groups:
|
|||
summary: SNMP interface high outbound error rate (instance {{ $labels.instance }})
|
||||
description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
|
||||
# Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
|
||||
- alert: SnmpInterfaceHighBandwidthUsageInbound
|
||||
expr: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
|
||||
for: 15m
|
||||
|
|
@ -56,7 +56,7 @@ groups:
|
|||
summary: SNMP interface high bandwidth usage inbound (instance {{ $labels.instance }})
|
||||
description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is a rough default. Adjust based on your link capacity and traffic patterns.
|
||||
# Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead.
|
||||
- alert: SnmpInterfaceHighBandwidthUsageOutbound
|
||||
expr: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0'
|
||||
for: 15m
|
||||
|
|
|
|||
2
dist/rules/spinnaker/embedded-exporter.yml
vendored
2
dist/rules/spinnaker/embedded-exporter.yml
vendored
|
|
@ -27,7 +27,7 @@ groups:
|
|||
|
||||
# The 30s threshold is a rough default. Adjust based on your pipeline SLOs.
|
||||
- alert: SpinnakerOrcaQueueMessageLagHigh
|
||||
expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30'
|
||||
expr: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
6
dist/rules/systemd/systemd-exporter.yml
vendored
6
dist/rules/systemd/systemd-exporter.yml
vendored
|
|
@ -34,7 +34,7 @@ groups:
|
|||
description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SystemdUnitTasksNearLimit
|
||||
expr: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
|
||||
expr: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -44,7 +44,7 @@ groups:
|
|||
|
||||
- alert: SystemdSocketRefusedConnections
|
||||
expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
|
||||
for: 0m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
|
@ -54,7 +54,7 @@ groups:
|
|||
# Threshold of 100 connections is arbitrary. Adjust to your workload.
|
||||
- alert: SystemdSocketHighConnections
|
||||
expr: 'systemd_socket_current_connections > 100'
|
||||
for: 0m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ groups:
|
|||
summary: WireGuard peer handshake too old (instance {{ $labels.instance }})
|
||||
description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers.
|
||||
- alert: WireguardPeerHandshakeNeverEstablished
|
||||
expr: 'wireguard_latest_handshake_seconds == 0'
|
||||
for: 5m
|
||||
|
|
|
|||
Loading…
Reference in a new issue