Feat/cilium alerting rules (#526)

* Add .worktrees/ to .gitignore

* feat: add Cilium alerting rules (32 rules across agent, operator, ClusterMesh, KVStoreMesh, Hubble)

* fix: use job label instead of k8s_app, switch to single-quoted YAML strings

* remove Cilium agent high restart rate alert
This commit is contained in:
Samuel Berthe 2026-03-16 17:10:59 +01:00 committed by GitHub
parent e8eb75c2e2
commit 2b99cf1f76
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 180 additions and 0 deletions

View file

@ -125,6 +125,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)
#### Cloud providers

View file

@ -4043,6 +4043,185 @@ groups:
severity: info
comments: sysUpTime is in centiseconds (hundredths of a second).
- name: Cilium
exporters:
- name: Embedded exporter
slug: embedded-exporter
doc_url: https://docs.cilium.io/en/stable/observability/metrics/
rules:
# Agent health
# Per pod: summed cilium_unreachable_nodes above 0 (for: 15m).
- name: Cilium agent unreachable nodes
description: 'Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.'
query: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
severity: warning
for: 15m
# Per pod: summed cilium_unreachable_health_endpoints above 0 (for: 15m).
- name: Cilium agent unreachable health endpoints
description: 'Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.'
query: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
severity: warning
for: 15m
# Per pod: summed cilium_controllers_failing above 0 (for: 5m).
- name: Cilium agent failing controllers
description: 'Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details.'
query: 'sum(cilium_controllers_failing{}) by (pod) > 0'
severity: warning
for: 5m
# Endpoints
# Per pod: cilium_endpoint_state series whose endpoint_state label is "invalid", summed above 0.
- name: Cilium agent endpoint failures
description: 'Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.'
query: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0'
severity: warning
for: 5m
# Per pod: non-zero 5m rate of endpoint regenerations with outcome="fail".
- name: Cilium agent endpoint regeneration failures
description: 'Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.'
query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0'
severity: warning
for: 5m
# Per pod, method and return code: PUT/POST/PATCH calls against endpoint="endpoint"
# that came back with a non-2xx return code.
- name: Cilium agent endpoint update failure
description: 'Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).'
query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0'
severity: warning
for: 5m
# Per pod and api_call: non-zero rate of failed endpoint-create requests.
# The rate window is 5m (was 1m): rate() needs at least two samples inside the window,
# which a 1m window does not reliably contain at 30s-60s scrape intervals, so the
# expression could return nothing and the alert would never fire.
- name: Cilium agent endpoint create failure
description: 'Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.'
query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[5m])) by (pod, api_call) > 0'
severity: info
for: 5m
# BPF maps
# Per map_name and pod: non-zero 5m rate of map operations with outcome="fail".
- name: Cilium agent map operation failures
description: 'Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded.'
query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0'
severity: warning
for: 5m
# Evaluated per series (no aggregation): any map whose pressure ratio exceeds 0.9.
- name: Cilium agent BPF map pressure
description: 'Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full.'
query: 'cilium_bpf_map_pressure{} > 0.9'
severity: warning
for: 5m
comments: Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped.
# Conntrack and NAT
# Per pod: non-zero 5m rate of drops with reason "CT: Map insertion failed".
- name: Cilium agent conntrack table full
description: 'Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.'
query: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0'
severity: critical
for: 5m
# Per pod: non-zero 5m rate of conntrack GC runs with status="uncompleted".
- name: Cilium agent conntrack failed garbage collection
description: 'Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate.'
query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0'
severity: warning
for: 5m
# Per pod: non-zero rate of drops with reason "No mapping for NAT masquerade".
# The rate window is 5m (was 1m), matching the conntrack-full rule on the same metric:
# rate() needs at least two samples inside the window, which a 1m window does not
# reliably contain at 30s-60s scrape intervals.
- name: Cilium agent NAT table full
description: 'Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate.'
query: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[5m])) by (pod) > 0'
severity: critical
for: 5m
# Packet drops
# Per pod: non-zero rate of drops with reason "Policy denied".
# The rate window is 5m (was 1m): rate() needs at least two samples inside the window.
# With a 1m window, a single late or missed scrape empties the result and resets the
# 10m "for" timer, so the alert would rarely (or never) reach firing.
- name: Cilium agent high denied rate
description: 'Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.'
query: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[5m])) by (pod) > 0'
severity: info
for: 10m
comments: Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked.
# Per pod and reason: non-zero 5m rate of drops for every reason other than "Policy denied".
- name: Cilium agent high drop rate
description: 'Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues.'
query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0'
severity: warning
for: 5m
# Policy
# Per pod: highest pressure among the series whose map_name matches cilium_policy_.*.
# Aggregated with max (was sum): pressure is a 0..1 ratio, so adding several matching
# series together could cross 0.9 without any single policy map being close to full.
# With a single matching series per pod the result is unchanged.
- name: Cilium agent policy map pressure
description: 'Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.'
query: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9'
severity: warning
for: 5m
# Per pod: non-zero 5m rate of policy changes with outcome="fail".
- name: Cilium agent policy import errors
description: 'Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete.'
query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0'
severity: warning
for: 5m
# Per pod: 99th percentile of policy implementation delay over 5m, compared to 60 (seconds).
# histogram_quantile() must be fed the per-bucket series (the ones carrying the "le"
# label), which a Prometheus histogram exposes under the _bucket suffix. The bare
# metric name has no such series, so the previous expression returned nothing and the
# alert could never fire.
- name: Cilium agent policy implementation delay
description: 'Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies.'
query: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
severity: warning
for: 5m
comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
# Identity
# Per pod: node_local identity count divided by 2^16-1, above 0.8.
- name: Cilium node-local high identity allocation
description: 'Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit.'
query: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8'
severity: warning
for: 5m
# Whole-cluster total (empty "by ()"): cluster_local identity count divided by 2^16-256, above 0.8.
- name: Cilium cluster high identity allocation
description: 'Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit.'
query: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8'
severity: warning
for: 5m
# IPAM
# Single total: sum of type="available" IPAM IPs at or below 0.
- name: Cilium operator exhausted IPAM IPs
description: 'Cilium operator has no available IPAM IPs. New pods will fail to schedule networking.'
query: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0'
severity: critical
for: 5m
# Share of IPs whose type is not "available" over all IPs, above 0.9.
# The trailing "and ... > 0" clause only keeps the result when the total is non-zero.
- name: Cilium operator low available IPAM IPs
description: 'Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.'
query: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0'
severity: warning
for: 5m
comments: Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size.
# Single total: non-zero 5m rate of interface creation operations whose status is not "success".
- name: Cilium operator IPAM interface creation failures
description: 'Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.'
query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0'
severity: warning
for: 10m
# API and K8s client
# Per pod and return code: non-zero 5m rate of agent API calls with a 5xx return code.
- name: Cilium agent API errors
description: 'Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.'
query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0'
severity: warning
for: 5m
# Per pod, endpoint and return code: non-2xx K8s client calls, excluding endpoint="metrics".
- name: Cilium agent Kubernetes client errors
description: 'Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }}).'
query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0'
severity: info
for: 5m
# ClusterMesh
# Per source/target cluster pair: count of readiness-status series reporting a value below 1.
- name: Cilium ClusterMesh remote cluster not ready
description: 'Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.'
query: 'count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
severity: critical
for: 5m
# Per source/target cluster pair: non-zero 5m rate of remote cluster failures.
- name: Cilium ClusterMesh remote cluster failing
description: 'Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing.'
query: 'sum(rate(cilium_clustermesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
severity: critical
for: 5m
# KVStoreMesh
# Per source/target cluster pair: count of readiness-status series reporting a value below 1.
- name: Cilium KVStoreMesh remote cluster not ready
description: 'Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.'
query: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
severity: critical
for: 5m
# Per source/target cluster pair: non-zero 5m rate of remote cluster failures.
- name: Cilium KVStoreMesh remote cluster failing
description: 'Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures.'
query: 'sum(rate(cilium_kvstoremesh_remote_cluster_failures[5m])) by (source_cluster, target_cluster) > 0'
severity: critical
for: 5m
# Per source cluster: non-zero 5m rate of kvstore sync errors.
- name: Cilium KVStoreMesh sync errors
description: 'Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.'
query: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0'
severity: critical
for: 5m
# Hubble
# Per pod: non-zero 5m rate of lost Hubble events.
- name: Cilium Hubble lost events
description: 'Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete.'
query: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0'
severity: warning
for: 5m
# Per pod: share of DNS responses whose rcode is not "No Error", above 0.1.
# The trailing "and ... > 0" clause only keeps the result when there is DNS response traffic.
- name: Cilium Hubble high DNS error rate
description: 'Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses.'
query: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0'
severity: warning
for: 5m
comments: Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload.
- name: WireGuard
exporters:
- name: MindFlavor/prometheus_wireguard_exporter