# # The following yaml cannot be copy-pasted to Prometheus configuration. # Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead. # # Contributing guidelines: # https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md # groups: - name: Basic resource monitoring services: - name: Prometheus self-monitoring exporters: - slug: embedded-exporter rules: - name: Prometheus job missing description: A Prometheus job has disappeared query: 'absent(up{job="prometheus"})' severity: warning - name: Prometheus target missing description: A Prometheus target has disappeared. An exporter might be crashed. query: "up == 0 unless on(job) (sum by (job) (up) == 0)" severity: critical for: 1m comments: | Only fire if at least one target in the job is still up. If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead. - name: Prometheus all targets missing description: A Prometheus job does not have living target anymore. query: "sum by (job) (up) == 0" severity: critical for: 1m - name: Prometheus target missing with warmup time description: "Allow a job time to start up (10 minutes) before alerting that it's down." query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))" severity: critical for: 1m - name: Prometheus configuration reload failure description: Prometheus configuration reload error query: "prometheus_config_last_reload_successful != 1" severity: warning - name: Prometheus too many restarts description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping. 
query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' severity: warning - name: Prometheus AlertManager job missing description: A Prometheus AlertManager job has disappeared query: 'absent(up{job="alertmanager"})' severity: warning - name: Prometheus AlertManager configuration reload failure description: AlertManager configuration reload error query: "alertmanager_config_last_reload_successful != 1" severity: warning - name: Prometheus AlertManager config not synced description: Configurations of AlertManager cluster instances are out of sync query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' severity: warning - name: Prometheus AlertManager E2E dead man switch description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager." query: "vector(1)" severity: critical - name: Prometheus not connected to alertmanager description: Prometheus cannot connect the alertmanager query: "prometheus_notifications_alertmanagers_discovered < 1" severity: critical - name: Prometheus rule evaluation failures description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts." query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0" severity: critical - name: Prometheus template text expansion failures description: "Prometheus encountered {{ $value }} template text expansion failures" query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0" severity: critical - name: Prometheus rule evaluation slow description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query." 
query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds" severity: warning for: 5m - name: Prometheus notifications backlog description: The Prometheus notification queue has not been empty for 10 minutes query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0" severity: warning - name: Prometheus AlertManager notification failing description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)" query: "rate(alertmanager_notifications_failed_total[3m]) > 0.05" severity: critical - name: Prometheus target empty description: Prometheus has no target in service discovery query: "prometheus_sd_discovered_targets == 0" severity: critical - name: Prometheus target scraping slow description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned. query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' severity: warning for: 5m - name: Prometheus large scrape description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)" query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10" severity: warning for: 5m - name: Prometheus target scrape duplicate description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)" query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3" severity: warning - name: Prometheus TSDB checkpoint creation failures description: "Prometheus encountered {{ $value }} checkpoint creation failures" query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB checkpoint deletion failures description: "Prometheus encountered {{ $value }} checkpoint deletion failures" query: 
"increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB compactions failed description: "Prometheus encountered {{ $value }} TSDB compactions failures" query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB head truncations failed description: "Prometheus encountered {{ $value }} TSDB head truncation failures" query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0" severity: critical - name: Prometheus TSDB reload failures description: "Prometheus encountered {{ $value }} TSDB reload failures" query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0" severity: critical - name: Prometheus TSDB WAL corruptions description: "Prometheus encountered {{ $value }} TSDB WAL corruptions" query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0" severity: critical - name: Prometheus TSDB WAL truncations failed description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures" query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0" severity: critical - name: Prometheus timeseries cardinality description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}' query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' severity: warning - name: Host and hardware exporters: - name: node-exporter slug: node-exporter doc_url: https://github.com/prometheus/node_exporter rules: - name: Host out of memory description: Node memory is filling up (< 10% left) query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)" severity: warning for: 2m - name: Host memory under memory pressure description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)." 
query: "(deriv(node_vmstat_pgmajfault[5m]) > 1000)" severity: warning comments: | node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate(). - name: Host Memory is underutilized description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})" query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8" severity: info comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host unusual network throughput in description: Host receive bandwidth is high (>80%). query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0" severity: warning - name: Host unusual network throughput out description: Host transmit bandwidth is high (>80%) query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0" severity: warning - name: Host disk IO utilization high description: Disk utilization is high (> 80%) query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)" severity: warning - name: Host out of disk space description: Disk is almost full (< 10% left) query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' severity: critical comments: | Please add ignored mountpoints in node_exporter parameters like "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. for: 2m - name: Host disk may fill in 24 hours description: Filesystem will likely run out of space within the next 24 hours. 
query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' severity: warning comments: | Please add ignored mountpoints in node_exporter parameters like "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. for: 2m - name: Host out of inodes description: Disk is almost running out of available inodes (< 10% left) query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0" severity: critical for: 2m - name: Host filesystem device error description: "Error stat-ing the {{ $labels.mountpoint }} filesystem" query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' severity: critical for: 2m - name: Host inodes may fill in 24 hours description: Filesystem will likely run out of inodes within the next 24 hours at current write rate query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' severity: warning for: 2m - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)" severity: warning for: 2m - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)" severity: warning for: 2m - name: Host high CPU load description: CPU load is > 80% query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80' severity: warning for: 10m - name: Host CPU is underutilized 
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' severity: warning - name: Host CPU high iowait description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities." query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8" severity: warning for: 5m - name: Host context switching high description: Context switching is growing on the node (twice the daily average during the last 15m) query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0' severity: warning comments: | x2 context switches is an arbitrary number. The alert threshold depends on the nature of the application. 
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
              # Guarded with SwapTotal > 0 so hosts without swap never match.
              - name: Host swap is filling up
                description: Swap is filling up (>80%)
                query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0"
                severity: warning
                for: 2m
              - name: Host systemd service crashed
                description: "systemd service {{ $labels.name }} crashed"
                query: '(node_systemd_unit_state{state="failed"} == 1)'
                severity: warning
              - name: Host physical component too hot
                description: "Physical hardware component too hot"
                query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
                severity: warning
                for: 5m
              - name: Host node overtemperature alarm
                description: "Physical node temperature alarm triggered"
                query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
                severity: critical
              - name: Host software RAID insufficient drives
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
                query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
                comments: |
                  Uses ignoring(state) to handle additional labels on node_md_disks.
                severity: critical
              - name: Host software RAID disk failure
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
                query: '(node_md_disks{state="failed"} > 0)'
                severity: warning
                for: 2m
              # NOTE(review): node_uname_info looks like a constant-valued info metric; if a
              # kernel change surfaces as a new series rather than a value change,
              # changes() would never exceed 0 - confirm this rule can actually fire.
              - name: Host kernel version deviations
                description: "Kernel version for {{ $labels.instance }} has changed."
                query: "changes(node_uname_info[1h]) > 0"
                severity: info
              # Exactly one `comments` key: a repeated mapping key is invalid YAML and
              # last-wins loaders would silently drop the first note.
              - name: Host OOM kill detected
                description: OOM kill detected
                query: "(delta(node_vmstat_oom_kill[30m]) > 0)"
                severity: warning
                comments: |
                  node_vmstat_oom_kill is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so delta() is used instead of increase().
                  When a machine runs out of memory, the node exporter can become unresponsive for several minutes.
Even if the system takes 15–20 minutes to recover, the alert should still trigger.
              # increase() yields a count, so "%.0f" is an appropriate rendering here.
              - name: Host EDAC Correctable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 1 minute.'
                query: "(increase(node_edac_correctable_errors_total[1m]) > 0)"
                severity: info
              - name: Host EDAC Uncorrectable Errors detected
                description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.'
                query: "(node_edac_uncorrectable_errors_total > 0)"
                severity: warning
              # The two network rules below alert on errors / packets, so $value is a
              # ratio (> 0.01), not an error count; it is rendered as a percentage.
              - name: Host Network Receive Errors
                description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes."
                query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              - name: Host Network Transmit Errors
                description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes."
                query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0"
                severity: warning
                for: 2m
              - name: Host Network Bond Degraded
                description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
                query: "((node_bonding_active - node_bonding_slaves) != 0)"
                severity: warning
                for: 2m
              # Guarded with limit > 0 to avoid dividing by a zero conntrack limit.
              - name: Host conntrack limit
                description: "The number of conntrack is approaching limit"
                query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0"
                severity: warning
                for: 5m
              - name: Host clock skew
                description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))" severity: warning for: 10m - name: Host clock not synchronising description: "Clock not synchronising. Ensure NTP is configured on this host." query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)" severity: warning for: 2m - name: S.M.A.R.T Device Monitoring exporters: - name: smartctl-exporter slug: smartctl-exporter doc_url: https://github.com/prometheus-community/smartctl_exporter rules: - name: SMART device temperature warning description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60' severity: warning - name: SMART device temperature critical description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70' severity: critical - name: SMART device temperature over trip value description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}' severity: critical - name: SMART device temperature nearing trip value description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }} query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)' severity: warning - name: 
SMART status description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_smart_status != 1" severity: critical - name: SMART critical warning description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_critical_warning > 0" severity: critical - name: SMART media errors description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_media_errors > 0" severity: critical - name: SMART Wearout Indicator description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }} query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold" severity: critical - name: IPMI exporters: - name: prometheus-community/ipmi_exporter slug: ipmi-exporter doc_url: https://github.com/prometheus-community/ipmi_exporter rules: - name: IPMI collector down description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity." query: 'ipmi_up == 0' severity: warning for: 5m comments: | The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC. - name: IPMI temperature sensor warning description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_temperature_state == 1' severity: warning for: 5m comments: | State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware. - name: IPMI temperature sensor critical description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage." 
query: 'ipmi_temperature_state == 2' severity: critical - name: IPMI fan speed sensor warning description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_fan_speed_state == 1' severity: warning for: 5m - name: IPMI fan speed sensor critical description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed." query: 'ipmi_fan_speed_state == 2' severity: critical - name: IPMI fan speed zero description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed." query: 'ipmi_fan_speed_rpm == 0' severity: critical for: 5m - name: IPMI voltage sensor warning description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_voltage_state == 1' severity: warning for: 5m - name: IPMI voltage sensor critical description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible." query: 'ipmi_voltage_state == 2' severity: critical - name: IPMI current sensor warning description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_current_state == 1' severity: warning for: 5m - name: IPMI current sensor critical description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state." query: 'ipmi_current_state == 2' severity: critical - name: IPMI power sensor warning description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state." query: 'ipmi_power_state == 1' severity: warning for: 5m - name: IPMI power sensor critical description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state." 
query: 'ipmi_power_state == 2' severity: critical - name: IPMI generic sensor critical description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state." query: 'ipmi_sensor_state == 2' severity: critical for: 5m comments: | Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts. - name: IPMI chassis power off description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly." query: 'ipmi_chassis_power_state == 0' severity: critical - name: IPMI chassis drive fault description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health." query: 'ipmi_chassis_drive_fault_state == 0' severity: critical comments: | The metric uses inverted logic: 1=no fault, 0=fault detected. - name: IPMI chassis cooling fault description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow." query: 'ipmi_chassis_cooling_fault_state == 0' severity: critical comments: | The metric uses inverted logic: 1=no fault, 0=fault detected. - name: IPMI SEL almost full description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events." query: 'ipmi_sel_free_space_bytes < 512' severity: warning for: 5m comments: | SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped. - name: Docker containers exporters: - name: google/cAdvisor slug: google-cadvisor doc_url: https://github.com/google/cadvisor rules: - name: Container killed description: A container has disappeared query: "time() - container_last_seen > 60" severity: warning comments: | This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 
- name: Container absent description: A container is absent for 5 min query: "absent(container_last_seen)" severity: warning for: 5m comments: | This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. - name: Container High CPU utilization description: 'Container CPU utilization is above 80% (current: {{ $value | printf "%.2f" }}%)' query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' comments: | Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard. severity: warning for: 2m - name: Container High Memory usage description: Container Memory usage is above 80% query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80' severity: warning comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d for: 2m - name: Container Volume usage description: Container Volume usage is above 80% query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total) BY (instance) > 0' severity: warning for: 2m - name: Container high throttle rate description: "Container is being throttled ({{ $value | humanizePercentage }})" query: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0' severity: warning for: 5m - name: Container high low 
change CPU usage description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%. query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25' severity: info - name: Container Low CPU utilization description: 'Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf "%.2f" }}%)' query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0' severity: info for: 7d - name: Container Low Memory usage description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory. 
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20' severity: info for: 7d - name: Blackbox exporters: - name: prometheus/blackbox_exporter slug: blackbox-exporter doc_url: https://github.com/prometheus/blackbox_exporter rules: - name: Blackbox probe failed description: Probe failed query: probe_success == 0 severity: critical for: 1m - name: Blackbox configuration reload failure description: Blackbox configuration reload failure query: "blackbox_exporter_config_last_reload_successful != 1" severity: warning - name: Blackbox slow probe description: Blackbox probe took more than 1s to complete query: "probe_duration_seconds > 1" severity: warning for: 1m - name: Blackbox probe HTTP failure description: HTTP status code is not 200-399 query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400" severity: critical for: 1m - name: Blackbox SSL certificate will expire soon description: SSL certificate expires in less than 20 days query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20" severity: warning - name: Blackbox SSL certificate will expire very soon description: SSL certificate expires in less than 3 days query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3" severity: critical - name: Blackbox SSL certificate expired description: SSL certificate has expired already query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0" severity: critical comments: | For probe_ssl_earliest_cert_expiry to be exposed after expiration, you need to enable insecure_skip_verify. Note that this will disable certificate validation. 
See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
              # The line above is the tail of a `comments` block that begins before this span; kept verbatim.
              # Blackbox latency rules: both fire when the probe duration stays above 1s for 1m.
              - name: Blackbox probe slow HTTP
                description: HTTP request took more than 1s
                query: "probe_http_duration_seconds > 1"
                severity: warning
                for: 1m
              - name: Blackbox probe slow ping
                description: Blackbox ping took more than 1s
                query: "probe_icmp_duration_seconds > 1"
                severity: warning
                for: 1m

      # Windows Server: rules for prometheus-community/windows_exporter.
      - name: Windows Server
        exporters:
          - name: prometheus-community/windows_exporter
            slug: windows-exporter
            doc_url: https://github.com/prometheus-community/windows_exporter
            rules:
              - name: Windows Server collector Error
                description: "Collector {{ $labels.collector }} was not successful"
                query: "windows_exporter_collector_success == 0"
                severity: critical
              - name: Windows Server service Status
                description: Windows Service state is not OK
                query: 'windows_service_status{status="ok"} != 1'
                severity: critical
                for: 1m
              # CPU usage is derived as 100 minus the averaged idle rate per instance.
              - name: Windows Server CPU Usage
                description: CPU Usage is more than 80%
                query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
                severity: warning
              - name: Windows Server memory Usage
                description: Memory usage is more than 90%
                query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
                severity: warning
                for: 2m
              # The trailing `size_bytes > 0` clause keeps zero-sized volumes out of the ratio.
              - name: Windows Server disk Space Usage
                description: Disk usage is more than 80%
                query: "100 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80 and windows_logical_disk_size_bytes > 0"
                severity: critical
                for: 2m

      # VMware: rules for pryorda/vmware_exporter.
      - name: VMware
        exporters:
          - name: pryorda/vmware_exporter
            slug: pryorda-vmware-exporter
            doc_url: https://github.com/pryorda/vmware_exporter
            rules:
              # Warning covers the 80-90 band; the critical rule below takes over at 90.
              - name: Virtual Machine Memory Warning
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
                severity: warning
                for: 5m
              - name: Virtual Machine Memory Critical
                description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
                query: "vmware_vm_mem_usage_average / 100 >= 90"
                severity: critical
                for: 1m
              - name: High Number of Snapshots
                description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
                query: "vmware_vm_snapshots > 3"
                severity: warning
                for: 30m
              # Snapshot age is converted from seconds to days before comparing with 3.
              - name: Outdated Snapshots
                description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
                query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
                severity: warning
                for: 5m

      # Proxmox VE: rules for prometheus-pve/prometheus-pve-exporter.
      - name: Proxmox VE
        exporters:
          - name: prometheus-pve/prometheus-pve-exporter
            slug: prometheus-pve-exporter
            doc_url: https://github.com/prometheus-pve/prometheus-pve-exporter
            rules:
              - name: PVE node down
                description: 'Proxmox VE node {{ $labels.id }} is down.'
                query: 'pve_up{id=~"node/.*"} == 0'
                severity: critical
                for: 2m
              - name: PVE VM/CT down
                description: 'Proxmox VE guest {{ $labels.id }} is not running.'
                query: 'pve_up{id=~"(qemu|lxc)/.*"} == 0'
                severity: warning
                for: 5m
                comments: |
                  This alert triggers for all VMs and containers that are not running.
                  You may want to filter by specific guests using the `id` label, or exclude intentionally stopped guests with additional label matchers.
              - name: PVE high CPU usage
                description: 'Proxmox VE CPU usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_cpu_usage_ratio * 100 > 90'
                severity: warning
                for: 5m
              - name: PVE high memory usage
                description: 'Proxmox VE memory usage is above 90% on {{ $labels.id }}. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_memory_usage_bytes / pve_memory_size_bytes * 100 > 90 and pve_memory_size_bytes > 0'
                severity: warning
                for: 5m
              # Storage rules come as a pair: warning at 80% used, critical at 95% used.
              - name: PVE storage filling up
                description: 'Proxmox VE storage {{ $labels.id }} is above 80% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 80 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: warning
                for: 5m
              - name: PVE storage almost full
                description: 'Proxmox VE storage {{ $labels.id }} is above 95% used. Current value: {{ $value | printf "%.2f" }}%'
                query: 'pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} * 100 > 95 and pve_disk_size_bytes{id=~"storage/.*"} > 0'
                severity: critical
                for: 2m
              - name: PVE guest not backed up
                description: '{{ $value }} Proxmox VE guest(s) are not covered by any backup job.'
                query: 'pve_not_backed_up_total > 0'
                severity: warning
              - name: PVE replication failed
                description: 'Proxmox VE replication for {{ $labels.id }} has {{ $value }} failed sync(s).'
                query: 'pve_replication_failed_syncs > 0'
                severity: warning
              - name: PVE cluster not quorate
                description: 'Proxmox VE cluster has lost quorum.'
                query: 'pve_cluster_info{quorate="0"} == 1'
                severity: critical
                comments: |
                  Loss of quorum means the cluster cannot make decisions about VM placement and fencing.
                  This requires immediate attention.

      # Netdata: rules for the exporter embedded in Netdata.
      - name: Netdata
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md
            rules:
              - name: Netdata high cpu usage
                description: Netdata high CPU usage (> 80%)
                query: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
                severity: warning
                for: 5m
                comments: |
                  This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
              - name: Netdata CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
# The first three keys below complete the "Netdata CPU steal noisy neighbor" rule whose name and description precede this span.
                query: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
                severity: warning
                for: 5m
              # Both ratio rules below fire when the free share drops under 20%.
              - name: Netdata high memory usage
                description: Netdata high memory usage (> 80%)
                query: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20 and netdata_system_ram_MiB_average > 0'
                severity: warning
                for: 5m
              - name: Netdata low disk space
                description: Netdata low disk space (> 80% used)
                query: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20 and netdata_disk_space_GB_average > 0'
                severity: warning
                for: 5m
              # Linear extrapolation of the last 3h of samples, 24h (24 * 3600 s) ahead.
              - name: Netdata predicted disk full
                description: Netdata predicted disk full in 24 hours
                query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
                severity: warning
              - name: Netdata MD mismatch cnt unsynchronized blocks
                description: RAID array has unsynchronized blocks
                query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
                severity: warning
                for: 2m
              - name: Netdata disk reallocated sectors
                description: "Disk reallocated sectors detected ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
                severity: info
              - name: Netdata disk current pending sector
                description: Disk current pending sector
                query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
                severity: warning
              - name: Netdata reported uncorrectable disk sectors
                description: "Reported uncorrectable disk sectors ({{ $value }} sectors)"
                query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
                severity: warning

      # eBPF: rules for cloudflare/ebpf_exporter.
      - name: eBPF
        exporters:
          - name: cloudflare/ebpf_exporter
            slug: ebpf-exporter
            doc_url: https://github.com/cloudflare/ebpf_exporter
            rules:
              - name: eBPF exporter program not attached
                description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_ebpf_program_attached == 0'
                severity: warning
                for: 5m
                comments: |
                  The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
              - name: eBPF exporter decoder errors
                description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})"
                query: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05'
                severity: warning
                for: 5m
              # The `or absent(...)` arm also fires when the metric is missing entirely.
              - name: eBPF exporter no enabled configs
                description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})"
                query: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
                severity: warning
                for: 5m

      # Process Exporter: rules for ncabatoff/process-exporter, keyed by the `groupname` label.
      - name: Process Exporter
        exporters:
          - name: ncabatoff/process-exporter
            slug: process-exporter
            doc_url: https://github.com/ncabatoff/process-exporter
            rules:
              - name: Process exporter group down
                description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_num_procs == 0'
                severity: warning
                for: 5m
              - name: Process exporter high memory usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
              - name: Process exporter high CPU usage
                description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_cpu_seconds_total[5m]) * 100 > 80'
                severity: warning
                for: 5m
                comments: |
                  Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc.
                  Threshold of 80% is per-core. Adjust based on expected workload.
              # File-descriptor rules come as a pair: warning above 0.8, critical above 0.95.
              - name: Process exporter high file descriptor usage
                description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
                severity: warning
                for: 5m
              - name: Process exporter file descriptors exhausted
                description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
                severity: critical
                for: 2m
              - name: Process exporter high swap usage
                description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 512MB is arbitrary. Adjust per group and environment.
              - name: Process exporter zombie processes
                description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
                query: 'namedprocess_namegroup_states{state="Zombie"} > 5'
                severity: warning
                for: 5m
              - name: Process exporter high context switching
                description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
                severity: warning
                for: 5m
                comments: |
                  Filters to voluntary switches only — involuntary switches are normal under CPU contention.
                  Threshold of 50000/s is a rough default. Adjust based on workload.
              - name: Process exporter high disk write IO
                description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
                query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
                severity: warning
                for: 5m
                comments: |
                  Threshold of 100MB/s is arbitrary. Adjust per group.
              - name: Process exporter process restarting
                description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
                query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
                severity: info
                comments: |
                  Detects restarts by watching for changes in the oldest process start time within the group.

      # Systemd: rules for prometheus-community/systemd_exporter.
      - name: Systemd
        exporters:
          - name: prometheus-community/systemd_exporter
            slug: systemd-exporter
            doc_url: https://github.com/prometheus-community/systemd_exporter
            rules:
              - name: Systemd unit failed
                description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="failed"} == 1'
                severity: warning
                for: 5m
              # The name=~ matcher is a placeholder that must be edited before use (see comments).
              - name: Systemd unit inactive
                description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})"
                query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
                severity: warning
                for: 5m
                comments: |
                  Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
              - name: Systemd service crash looping
                description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})"
                query: 'increase(systemd_service_restart_total[1h]) > 5'
                severity: critical
                for: 5m
              - name: Systemd unit tasks near limit
                description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})"
                query: 'systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max > 0.9 and ignoring(type) systemd_unit_tasks_max > 0'
                severity: warning
                for: 5m
              - name: Systemd socket refused connections
                description: "Systemd socket {{ $labels.name }} is refusing connections. ({{ $value }} refused in last 5m, instance {{ $labels.instance }})"
                query: 'delta(systemd_socket_refused_connections_total[5m]) > 3'
                comments: |
                  systemd_socket_refused_connections_total is declared as Gauge by the exporter despite the _total suffix, so delta() is used instead of increase().
                severity: warning
                for: 2m
              - name: Systemd socket high connections
                description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})"
                query: 'systemd_socket_current_connections > 100'
                severity: warning
                for: 2m
                comments: |
                  Threshold of 100 connections is arbitrary. Adjust to your workload.
              # The `> 0` clause skips timers whose last-trigger timestamp is zero.
              - name: Systemd timer missed trigger
                description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})"
                query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
                severity: warning
                for: 5m
                comments: |
                  Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.

  # New top-level group: database services.
  - name: Databases
    services:
      # MySQL: rules for prometheus/mysqld_exporter.
      - name: MySQL
        exporters:
          - name: prometheus/mysqld_exporter
            slug: mysqld-exporter
            doc_url: https://github.com/prometheus/mysqld_exporter
            rules:
              - name: MySQL down
                description: MySQL instance is down on {{ $labels.instance }}
                query: "mysql_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
# MySQL rules (prometheus/mysqld_exporter), continued: connection/statement saturation and replication thread state.
              # Saturation rules take the 1m peak and guard against a zero limit in the divisor.
              - name: MySQL too many connections (> 80%)
                description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              - name: MySQL high prepared statements utilization (> 80%)
                description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0"
                severity: warning
                for: 2m
              - name: MySQL high threads running
                description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
                query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0"
                severity: warning
                for: 2m
              # Replication-thread rules only consider instances where master_server_id > 0.
              - name: MySQL Slave IO thread not running
                description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
                query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: MySQL Slave SQL thread not running
                description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
                query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
# MySQL rules (prometheus/mysqld_exporter), continued: replication lag, slow queries, InnoDB log waits, restarts, QPS, open files.
              # Lag is measured net of the configured sql_delay, on instances where master_server_id > 0.
              - name: MySQL Slave replication lag
                description: "MySQL replication lag on {{ $labels.instance }}"
                query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
                severity: critical
                for: 1m
              - name: MySQL slow queries
                description: "MySQL server has some new slow queries ({{ $value }} in the last minute)."
                query: "delta(mysql_global_status_slow_queries[1m]) > 0"
                comments: |
                  mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase().
                severity: warning
                for: 2m
              - name: MySQL InnoDB log waits
                description: "MySQL innodb log writes stalling ({{ $value }} waits/s)"
                query: "deriv(mysql_global_status_innodb_log_waits[15m]) > 10"
                comments: |
                  mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate().
                severity: warning
              - name: MySQL restarted
                description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}."
                query: "mysql_global_status_uptime < 60"
                severity: info
              - name: MySQL High QPS
                description: MySQL is being overloaded with unusual QPS (> 10k QPS).
                query: "deriv(mysql_global_status_questions[1m]) > 10000"
                comments: |
                  mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate().
                severity: info
                for: 2m
              # The query, severity and `for` keys of this rule follow on the next source line.
              - name: MySQL too many open files
                description: "MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}."
# The first three keys below complete the "MySQL too many open files" rule whose name and description precede this span.
                query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB Force Recovery is enabled
                description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
                query: "mysql_global_variables_innodb_force_recovery != 0"
                severity: warning
                for: 2m
              - name: MySQL InnoDB history_len too long
                description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
                query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
                severity: warning
                for: 2m

      # PostgreSQL: rules for prometheus-community/postgres_exporter.
      # Several rules rely on custom-query metrics; their `comments` link to the required definitions.
      - name: PostgreSQL
        exporters:
          - name: prometheus-community/postgres_exporter
            slug: postgres-exporter
            doc_url: https://github.com/prometheus-community/postgres_exporter
            rules:
              - name: Postgresql down
                description: Postgresql instance is down
                query: "pg_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Postgresql restarted
                description: Postgresql restarted
                query: "time() - pg_postmaster_start_time_seconds < 60"
                severity: critical
              - name: Postgresql exporter error
                description: Postgresql exporter is showing errors. A query may be buggy in queries.yaml
                query: "pg_exporter_last_scrape_error > 0"
                severity: critical
              # Both autovacuum rules use a 10-day window, written as a product of seconds.
              - name: Postgresql table not auto vacuumed
                description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
                severity: warning
              - name: Postgresql table not auto analyzed
                description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
                severity: warning
              - name: Postgresql too many connections
                description: PostgreSQL instance has too many connections (> 80%).
                query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
                severity: warning
                for: 2m
              - name: Postgresql not enough connections
                description: PostgreSQL instance should have more connections (> 5)
                query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
                severity: critical
                for: 2m
              - name: Postgresql dead locks
                description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)"
                query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5'
                severity: warning
              # rollback / (rollback + commit) > 0.02; the trailing clause requires a non-zero denominator.
              - name: Postgresql high rollback rate
                description: Ratio of transactions being aborted compared to committed is > 2 %
                query: 'sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0'
                severity: warning
              - name: Postgresql commit rate low
                description: Postgresql seems to be processing very few transactions
                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                severity: critical
                for: 2m
              - name: Postgresql low XID consumption
                description: Postgresql seems to be consuming transaction IDs very slowly
                query: "rate(pg_txid_current[1m]) < 5"
                severity: warning
                for: 2m
                comments: |
                  pg_txid_current is not a default postgres_exporter metric. You need to define a custom query.
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql unused replication slot
                description: Unused Replication Slots
                query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)"
                severity: warning
                for: 1m
              - name: Postgresql too many dead tuples
                description: PostgreSQL dead tuples is too large
                query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0"
                severity: warning
                for: 2m
              # Compares every pg_settings_* series (except transaction_read_only) with its value 5m earlier.
              - name: Postgresql configuration changed
                description: Postgres Database configuration change has occurred
                query: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
                severity: info
              - name: Postgresql SSL compression active
                description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
                query: "sum by (instance) (pg_stat_ssl_compression) > 0"
                severity: warning
                comments: |
                  pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14).
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql too many locks acquired
                description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
                query: "((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0"
                severity: critical
                for: 2m
              # Bloat rules only consider objects above a size floor (second operand of `and`).
              - name: Postgresql bloat index high (> 80%)
                description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
                query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql bloat table high (> 80%)
                description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
                query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
                severity: warning
                for: 1h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              # No comparison operator: any series whose indexrelname matches ".*ccnew.*" raises the alert.
              - name: Postgresql invalid index
                description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
                query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
                severity: warning
                for: 6h
                comments: |
                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
              - name: Postgresql replication lag
                description: The PostgreSQL replication lag is high (> 5s)
                query: "pg_replication_lag_seconds > 5"
                severity: warning
                for: 30s

      # SQL Server: rules for Ozarklake/prometheus-mssql-exporter.
      - name: SQL Server
        exporters:
          - name: Ozarklake/prometheus-mssql-exporter
            slug: ozarklake-mssql-exporter
            doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter
            rules:
              - name: SQL Server down
                description: SQL server instance is down
                query: "mssql_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: SQL Server deadlock
                description: SQL Server {{ $labels.instance }} is experiencing deadlocks ({{ $value }}/s)
                query: "mssql_deadlocks > 5"
                severity: warning
                for: 1m

      # Oracle Database: rules for iamseth/oracledb_exporter.
      - name: Oracle Database
        exporters:
          - name: iamseth/oracledb_exporter
            slug: iamseth-oracledb-exporter
            doc_url: https://github.com/iamseth/oracledb_exporter
            rules:
              - name: Oracle DB down
                description: Oracle Database instance is down on {{ $labels.instance }}
                query: "oracledb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Oracle DB sessions reaching limit (> 85%)
                description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: 'oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 and oracledb_resource_limit_value{resource_name="sessions"} > 0'
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
# Oracle Database rules (iamseth/oracledb_exporter), continued: process limit, tablespace usage, rollbacks, active sessions.
              - name: Oracle DB processes reaching limit (> 85%)
                description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: 'oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 and oracledb_resource_limit_value{resource_name="processes"} > 0'
                severity: warning
                for: 5m
                comments: |
                  Threshold is workload-dependent. Adjust 85% to suit your environment.
              # Tablespace rules come as a pair: warning above 85, critical above 95.
              - name: Oracle DB tablespace reaching capacity (> 85%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 85"
                severity: warning
                for: 5m
              - name: Oracle DB tablespace full (> 95%)
                description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "oracledb_tablespace_used_percent > 95"
                severity: critical
                for: 5m
              # rollbacks / (commits + rollbacks) as a percentage; the trailing clause requires a non-zero denominator.
              - name: Oracle DB high user rollbacks
                description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)"
                query: "rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0"
                severity: warning
                for: 5m
                comments: |
                  A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
              - name: Oracle DB too many active sessions
                description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})"
                query: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
                severity: warning
                for: 5m
                comments: |
                  Threshold is highly workload-dependent. Adjust 200 to suit your environment.
# Last Oracle Database rule, then the Patroni and PGBouncer services.
              - name: Oracle DB high wait time (user I/O)
                description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time"
                query: "oracledb_wait_time_user_io > 300"
                severity: warning
                for: 5m
                comments: |
                  The metric from v$waitclassmetric is already a normalized rate (centiseconds per second).
                  Threshold 300 means 3 seconds of I/O wait per second of wall time.

      # Patroni: rule for the exporter embedded in Patroni >= 2.1.0.
      - name: Patroni
        exporters:
          - name: Embedded exporter (Patroni >= 2.1.0)
            slug: embedded-exporter-patroni
            doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint
            rules:
              # Fires per `scope` when neither a primary nor a standby leader reports 1.
              - name: Patroni has no Leader
                description: "A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}"
                query: "(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.

      # PGBouncer: rules for spreaker/prometheus-pgbouncer-exporter.
      - name: PGBouncer
        exporters:
          - name: spreaker/prometheus-pgbouncer-exporter
            slug: spreaker-pgbouncer-exporter
            doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter
            rules:
              - name: PGBouncer active connections
                description: PGBouncer pools are filling up
                query: "pgbouncer_pools_server_active_connections > 200"
                severity: warning
                for: 2m
              # Errors with errmsg "server conn crashed?" are excluded by the matcher.
              - name: PGBouncer errors
                description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
                query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
                severity: warning
              # The query and severity keys of this rule follow on the next source line.
              - name: PGBouncer max connections
                description: The number of PGBouncer client connections has reached max_client_conn.
# The first two keys below complete the "PGBouncer max connections" rule whose name and description precede this span.
                query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
                severity: critical

      # Redis: rules for oliver006/redis_exporter.
      - name: Redis
        exporters:
          - name: oliver006/redis_exporter
            slug: oliver006-redis-exporter
            doc_url: https://github.com/oliver006/redis_exporter
            rules:
              - name: Redis down
                description: Redis instance is down
                query: "redis_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # `or vector(0)` supplies a 0 when count() has no master series to count.
              - name: Redis missing master
                description: Redis cluster has no node marked as master.
                query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
                severity: critical
              - name: Redis too many masters
                description: Redis cluster has too many nodes marked as master.
                query: 'count(redis_instance_info{role="master"}) > 1'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Redis disconnected slaves
                description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
                query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
                severity: critical
              - name: Redis replication broken
                description: Redis instance lost a slave
                query: "delta(redis_connected_slaves[1m]) < 0"
                severity: critical
              # The query, severity and `for` keys of this rule follow on the next source line.
              - name: Redis cluster flapping
                description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
# The first three keys below complete the "Redis cluster flapping" rule whose name and description precede this span.
                query: "changes(redis_connected_slaves[1m]) > 1"
                severity: critical
                for: 2m
              # 60 * 60 * 48 seconds = 48 hours since the last RDB save.
              - name: Redis missing backup
                description: Redis has not been backed up for 48 hours
                query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 48"
                severity: critical
              - name: Redis out of system memory
                description: Redis is running out of system memory (> 90%)
                query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 and redis_total_system_memory_bytes > 0"
                severity: warning
                for: 2m
                comments: |
                  The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
              - name: Redis out of configured maxmemory
                description: Redis is running out of configured maxmemory (> 90%)
                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
                severity: warning
                for: 2m
              - name: Redis too many connections
                description: Redis is running out of connections (> 90% used)
                query: "redis_connected_clients / redis_config_maxclients * 100 > 90 and redis_config_maxclients > 0"
                severity: warning
                for: 2m
              - name: Redis not enough connections
                description: Redis instance should have more connections (> 5)
                query: "redis_connected_clients < 5"
                severity: warning
                for: 2m
              - name: Redis rejected connections
                description: Some connections to Redis have been rejected
                query: "increase(redis_rejected_connections_total[1m]) > 5"
                severity: warning

      # Memcached: rules for prometheus/memcached_exporter.
      - name: Memcached
        exporters:
          - name: prometheus/memcached_exporter
            slug: memcached-exporter
            doc_url: https://github.com/prometheus/memcached_exporter
            rules:
              - name: Memcached down
                description: Memcached instance is down on {{ $labels.instance }}
                query: "memcached_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
# Memcached rules (prometheus/memcached_exporter), continued: connection limit, out-of-memory errors, memory usage, evictions.
              # Connection-limit rules come as a pair: warning above 80%, critical above 95%.
              - name: Memcached connection limit approaching (> 80%)
                description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0"
                severity: warning
                for: 2m
              - name: Memcached connection limit approaching (> 95%)
                description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0"
                severity: critical
                for: 2m
              # Per-slab error rates are summed, dropping the `slab` label.
              - name: Memcached out of memory errors
                description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)"
                query: "sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05"
                severity: warning
                for: 5m
              - name: Memcached memory usage high (> 90%)
                description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0"
                severity: warning
                for: 5m
                comments: |
                  High memory usage is expected if the cache is well-utilized.
                  This alert fires when it approaches the configured limit, which may cause evictions.
              - name: Memcached high eviction rate
                description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)"
                query: "rate(memcached_items_evicted_total[5m]) > 10"
                severity: warning
                for: 5m
                comments: |
                  A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage.
                  Threshold of 10 evictions/s is a rough default — adjust based on your workload.
# Remaining Memcached rules, then the start of the MongoDB service (Percona exporter).
              # hit / (hit + miss) as a percentage; the trailing clause requires some `get` traffic.
              - name: Memcached low cache hit rate (< 80%)
                description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)"
                query: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
                severity: warning
                for: 10m
                comments: |
                  A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short.
                  Threshold of 80% is a rough default — adjust based on your workload and access patterns.
              - name: Memcached connections rejected
                description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_connections_rejected_total[5m]) > 3"
                severity: warning
                for: 5m
              - name: Memcached items too large
                description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)"
                query: "increase(memcached_item_too_large_total[5m]) > 3"
                severity: info
                for: 5m

      # MongoDB: first exporter is percona/mongodb_exporter.
      - name: MongoDB
        exporters:
          - name: percona/mongodb_exporter
            slug: percona-mongodb-exporter
            doc_url: https://github.com/percona/mongodb_exporter
            rules:
              - name: MongoDB Down
                description: MongoDB instance is down
                query: "mongodb_up == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Mongodb replica member unhealthy
                description: MongoDB replica member is not healthy
                query: "mongodb_rs_members_health == 0"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
# MongoDB rules (Percona, DCU and mgob exporters), then the start of the Elasticsearch service.
              # Primary optime minus each secondary's optime, matched on the "set" label.
              # NOTE(review): the division by 1000 suggests optimeDate is in milliseconds — confirm against the exporter.
              - name: MongoDB replication lag (Percona)
                description: Mongodb replication lag is more than 10s
                query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
                severity: critical
              - name: MongoDB replication headroom
                description: MongoDB replication headroom is <= 0
                query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
                severity: critical
                comments: |
                  This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names.
                  It requires the Percona exporter to run with --compatible-mode to expose both.
              - name: MongoDB number cursors open (Percona)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
                severity: warning
                for: 2m
              - name: MongoDB cursors timeouts (Percona)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
                severity: warning
                for: 2m
              # Usage = current / (current + available).
              # The series are aggregated "without (conn_type)" because PromQL binary operators
              # only pair series with identical label sets: adding the conn_type="current" and
              # conn_type="available" series directly returns nothing, so the alert never fires.
              - name: MongoDB too many connections (Percona)
                description: Too many connections (> 80%)
                query: 'sum without (conn_type) (mongodb_ss_connections{conn_type="current"}) / sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80 and sum without (conn_type) (mongodb_ss_connections{conn_type=~"current|available"}) > 0'
                severity: warning
                for: 2m
          - name: dcu/mongodb_exporter
            slug: dcu-mongodb-exporter
            doc_url: https://github.com/dcu/mongodb_exporter
            rules:
              - name: MongoDB replication lag (DCU)
                description: Mongodb replication lag is more than 10s
                query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
                severity: critical
              # One rule per replica-set member state code (3, 6, 8, 9, 10).
              - name: MongoDB replication Status 3
                description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
                query: "mongodb_replset_member_state == 3"
                severity: critical
              - name: MongoDB replication Status 6
                description: MongoDB Replication set member as seen from another member of the set, is not yet known
                query: "mongodb_replset_member_state == 6"
                severity: critical
              - name: MongoDB replication Status 8
                description: MongoDB Replication set member as seen from another member of the set, is unreachable
                query: "mongodb_replset_member_state == 8"
                severity: critical
              - name: MongoDB replication Status 9
                description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
                query: "mongodb_replset_member_state == 9"
                severity: critical
              - name: MongoDB replication Status 10
                description: MongoDB Replication set member was once in a replica set but was subsequently removed
                query: "mongodb_replset_member_state == 10"
                severity: critical
              - name: MongoDB number cursors open (DCU)
                description: Too many cursors opened by MongoDB for clients (> 10k)
                query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
                severity: warning
                for: 2m
              - name: MongoDB cursors timeouts (DCU)
                description: "Too many cursors are timing out ({{ $value }} in the last minute)"
                query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
                severity: warning
                for: 2m
              # Same ratio as the Percona rule; aggregated "without (state)" so the
              # state="current" and state="available" series can actually be combined.
              - name: MongoDB too many connections (DCU)
                description: Too many connections (> 80%)
                query: 'sum without (state) (mongodb_connections{state="current"}) / sum without (state) (mongodb_connections{state=~"current|available"}) * 100 > 80 and sum without (state) (mongodb_connections{state=~"current|available"}) > 0'
                severity: warning
                for: 2m
          - name: stefanprodan/mgob
            slug: stefanprodan-mgob-exporter
            doc_url: https://github.com/stefanprodan/mgob
            rules:
              - name: Mgob backup failed
                description: MongoDB backup has failed
                query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
                severity: critical
      - name: Elasticsearch
        exporters:
          - name: prometheus-community/elasticsearch_exporter
            slug: prometheus-community-elasticsearch-exporter
            doc_url: https://github.com/prometheus-community/elasticsearch_exporter
            rules:
              # Heap usage: critical above 90%, warning above 80%.
              - name: Elasticsearch Heap Usage Too High
                description: "The heap usage is over 90%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: critical
                for: 2m
              - name: Elasticsearch Heap Usage warning
                description: "The heap usage is over 80%"
                query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
                severity: warning
                for: 2m
              # Disk rules compare the *available* percentage, hence "< 10" for "over 90% used".
              - name: Elasticsearch disk out of space
                description: The disk usage is over 90%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: critical
              - name: Elasticsearch disk space low
                description: The disk usage is over 80%
                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0"
                severity: warning
                for: 2m
              - name: Elasticsearch Cluster Red
                description: Elastic Cluster Red status
                query: 'elasticsearch_cluster_health_status{color="red"} == 1'
                severity: critical
              - name: Elasticsearch Cluster Yellow
                description: Elastic Cluster Yellow status
                query: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
                severity: warning
              # The expected node count (3) is hard-coded in the query.
              - name: Elasticsearch Healthy Nodes
                description: "Missing node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
# Elasticsearch (continued), OpenSearch, Meilisearch, Cassandra, ClickHouse and the first CouchDB rules.
              # The expected data-node count (3) is hard-coded in the query.
              - name: Elasticsearch Healthy Data Nodes
                description: "Missing data node in Elasticsearch cluster"
                query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # Shard-state rules come in pairs: an immediate "info" and a "warning" after 15m.
              - name: Elasticsearch relocating shards
                description: "Elasticsearch is relocating shards"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: info
              - name: Elasticsearch relocating shards too long
                description: "Elasticsearch has been relocating shards for 15min"
                query: "elasticsearch_cluster_health_relocating_shards > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch initializing shards
                description: "Elasticsearch is initializing shards"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: info
              - name: Elasticsearch initializing shards too long
                description: "Elasticsearch has been initializing shards for 15 min"
                query: "elasticsearch_cluster_health_initializing_shards > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch unassigned shards
                description: "Elasticsearch has unassigned shards"
                query: "elasticsearch_cluster_health_unassigned_shards > 0"
                severity: critical
                for: 2m
              - name: Elasticsearch pending tasks
                description: "Elasticsearch has pending tasks. Cluster works slowly."
                query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
                severity: warning
                for: 15m
              - name: Elasticsearch no new documents
                description: "No new documents for 10 min!"
                query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
                severity: warning
              # Average time per indexing operation = time rate / operation rate.
              - name: Elasticsearch High Indexing Latency
                description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0"
                severity: warning
                for: 10m
                comments: |
                  Threshold of 10ms (0.01s) per indexing operation is a rough default.
                  Adjust based on your document size and cluster performance.
              - name: Elasticsearch High Indexing Rate
                description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
                severity: warning
                for: 5m
                comments: |
                  Threshold of 10000 ops/s is a rough default.
                  Adjust based on your cluster capacity and expected workload.
              - name: Elasticsearch High Query Rate
                description: "The query rate on Elasticsearch cluster is higher than the threshold."
                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
                severity: warning
                for: 5m
                comments: |
                  Threshold of 100 queries/s is very low for most production clusters.
                  Adjust based on your expected query volume.
              # Average time per search query = time rate / query rate.
              - name: Elasticsearch High Query Latency
                description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s)."
                query: "rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0"
                severity: warning
                for: 5m
      - name: OpenSearch
        exporters:
          - name: opensearch-project/opensearch-prometheus-exporter
            slug: opensearch-project-opensearch-prometheus-exporter
            doc_url: https://github.com/opensearch-project/opensearch-prometheus-exporter
            rules:
              - name: OpenSearch is unhealthy
                description: "OpenSearch cluster {{ $labels.cluster }} is unhealthy"
                query: "opensearch_cluster_status != 0"
                severity: critical
              - name: OpenSearch high heap usage
                description: "OpenSearch heap usage on cluster {{ $labels.cluster }} is too high"
                query: "opensearch_jvm_mem_heap_used_percent > 90"
                severity: warning
                for: 5m
              - name: OpenSearch circuitbreaker tripped
                description: "The circuitbreaker on OpenSearch cluster {{ $labels.cluster }} has tripped to prevent Java OutOfMemoryError"
                query: "opensearch_circuitbreaker_tripped_count > 0"
                severity: warning
                for: 5m
              - name: OpenSearch has pending tasks
                description: "OpenSearch cluster {{ $labels.cluster }} has pending tasks"
                query: "opensearch_cluster_pending_tasks_number > 0"
                severity: warning
                for: 5m
              - name: OpenSearch indexing is throttled
                description: "Indexing on OpenSearch cluster {{ $labels.cluster }} is throttled"
                query: "opensearch_indices_indexing_is_throttled_bool > 0"
                severity: warning
                for: 5m
              - name: OpenSearch has inactive shards
                description: "OpenSearch cluster {{ $labels.cluster }} has inactive shards"
                query: "opensearch_cluster_shards_active_percent < 100.0"
                severity: warning
                for: 5m
      - name: Meilisearch
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            doc_url: https://github.com/orgs/meilisearch/discussions/625
            rules:
              - name: Meilisearch index is empty
                description: Meilisearch index {{ $labels.index }} has zero documents
                query: "meilisearch_index_docs_count == 0"
                severity: warning
              - name: Meilisearch http response time
                description: Meilisearch http response time is too high
                query: "meilisearch_http_response_time_seconds > 0.5"
                severity: warning
      - name: Cassandra
        exporters:
          - name: instaclustr/cassandra-exporter
            slug: instaclustr-cassandra-exporter
            doc_url: https://github.com/instaclustr/cassandra-exporter
            rules:
              - name: "Cassandra Node is unavailable"
                description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
                query: "cassandra_endpoint_active < 1"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: "Cassandra many compaction tasks are pending"
                description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
                query: "cassandra_table_estimated_pending_compactions > 100"
                severity: warning
              - name: "Cassandra commitlog pending tasks (Instaclustr)"
                description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
                query: "cassandra_commit_log_pending_tasks > 15"
                severity: warning
                for: 2m
              - name: "Cassandra compaction executor blocked tasks (Instaclustr)"
                description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
                severity: warning
                for: 2m
              - name: "Cassandra flush writer blocked tasks (Instaclustr)"
                description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
                query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
                severity: warning
                for: 2m
              - name: "Cassandra connection timeouts total (Instaclustr)"
                description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
                query: "sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5"
                severity: critical
                for: 2m
              - name: "Cassandra storage exceptions (Instaclustr)"
                description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
                query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
                severity: critical
              - name: "Cassandra tombstone dump (Instaclustr)"
                description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
                query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
                severity: critical
                for: 2m
              - name: "Cassandra client request unavailable write (Instaclustr)"
                description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
                severity: critical
                for: 2m
              - name: "Cassandra client request unavailable read (Instaclustr)"
                description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
                query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
                severity: critical
                for: 2m
              - name: "Cassandra client request write failure (Instaclustr)"
                description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5'
                severity: critical
                for: 2m
              - name: "Cassandra client request read failure (Instaclustr)"
                description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5'
                severity: critical
                for: 2m
          # The Criteo exporter rules all read one metric, cassandra_stats, selected by its "name" label.
          - name: criteo/cassandra_exporter
            slug: criteo-cassandra-exporter
            doc_url: https://github.com/criteo/cassandra_exporter
            rules:
              - name: Cassandra hints count
                description: Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3'
                severity: critical
              - name: Cassandra compaction task pending
                description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
                severity: warning
                for: 2m
              - name: Cassandra viewwrite latency
                description: High viewwrite latency on {{ $labels.instance }} cassandra node
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
                severity: warning
                for: 2m
              - name: Cassandra authentication failures
                description: Increase of Cassandra authentication failures
                query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
                severity: warning
                for: 2m
              - name: Cassandra node down
                description: Cassandra node down
                query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: Cassandra commitlog pending tasks (Criteo)
                description: Unexpected number of Cassandra commitlog pending tasks
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
                severity: warning
                for: 2m
              - name: Cassandra compaction executor blocked tasks (Criteo)
                description: Some Cassandra compaction executor tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra flush writer blocked tasks (Criteo)
                description: Some Cassandra flush writer tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra repair pending tasks
                description: Some Cassandra repair tasks are pending
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2'
                severity: warning
                for: 2m
              - name: Cassandra repair blocked tasks
                description: Some Cassandra repair tasks are blocked
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0'
                severity: warning
                for: 2m
              - name: Cassandra connection timeouts total (Criteo)
                description: Some connection between nodes are ending in timeout
                query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
                severity: critical
                for: 2m
              - name: Cassandra storage exceptions (Criteo)
                description: Something is going wrong with cassandra storage
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
                severity: critical
              - name: Cassandra tombstone dump (Criteo)
                description: Too much tombstones scanned in queries
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
                severity: critical
              - name: Cassandra client request unavailable write (Criteo)
                description: Write failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
                severity: critical
              - name: Cassandra client request unavailable read (Criteo)
                description: Read failures have occurred because too many nodes are unavailable
                query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
                severity: critical
              - name: Cassandra client request write failure (Criteo)
                description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05'
                severity: critical
              - name: Cassandra client request read failure (Criteo)
                description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05'
                severity: critical
              - name: Cassandra cache hit rate key cache
                description: Key cache hit rate is below 85%
                query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
                severity: warning
                for: 2m
                comments: |
                  A low key cache hit rate increases disk I/O.
                  Threshold is workload-dependent — adjust based on your data access patterns.
      - name: Clickhouse
        exporters:
          - name: Embedded Exporter
            slug: embedded-exporter
            doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
            rules:
              - name: ClickHouse node down
                description: "No metrics received from ClickHouse exporter for over 2 minutes."
                query: 'up{job="clickhouse"} == 0'
                severity: critical
                for: 2m
                comments: |
                  Adjust the job label to match your Prometheus configuration.
              # Memory usage: critical above 90%, warning above 80%.
              - name: ClickHouse Memory Usage Critical
                description: "Memory usage is critically high, over 90%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: critical
                for: 5m
              - name: ClickHouse Memory Usage Warning
                description: "Memory usage is over 80%."
                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0"
                severity: warning
                for: 5m
              # Free-space percentage = available / (available + used); the disk name is part of the metric name.
              - name: ClickHouse Disk Space Low on Default
                description: "Disk space on default is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse Disk Space Critical on Default
                description: "Disk space on default disk is critically low, below 10%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0"
                severity: critical
                for: 2m
              - name: ClickHouse Disk Space Low on Backups
                description: "Disk space on backups is below 20%."
                query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0"
                severity: warning
                for: 2m
              - name: ClickHouse Replica Errors
                description: "Critical replica errors detected, either all replicas are stale or lost."
                query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1"
                severity: critical
              - name: ClickHouse No Available Replicas
                description: "No available replicas in ClickHouse."
                query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1"
                severity: critical
              - name: ClickHouse No Live Replicas
                description: "There are too few live replicas available, risking data loss and service disruption."
                query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1"
                severity: critical
              - name: ClickHouse High TCP Connections
                description: "High number of TCP connections, indicating heavy client or inter-cluster communication."
                query: "ClickHouseMetrics_TCPConnection > 400"
                severity: warning
                for: 5m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse Interserver Connection Issues
                description: "High number of interserver connections may indicate replication or distributed query handling issues."
                query: "ClickHouseMetrics_InterserverConnection > 50"
                severity: warning
                for: 5m
                comments: |
                  Adjust the threshold based on your cluster size and expected replication traffic.
              - name: ClickHouse ZooKeeper Connection Issues
                description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination."
                query: "ClickHouseMetrics_ZooKeeperSession != 1"
                severity: warning
                for: 3m
              - name: ClickHouse Authentication Failures
                description: "Authentication failures detected, indicating potential security issues or misconfiguration."
                query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3"
                severity: info
              - name: ClickHouse Access Denied Errors
                description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts."
                query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3"
                severity: info
              - name: ClickHouse rejected insert queries
                description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2"
                severity: warning
                for: 1m
              - name: ClickHouse delayed insert queries
                description: "INSERTs delayed due to high number of active parts."
                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10"
                severity: warning
                for: 2m
              - name: ClickHouse zookeeper hardware exception
                description: "Zookeeper hardware exception: network issues communicating with ZooKeeper"
                query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0"
                severity: critical
                for: 1m
              # 100*1024*1024 bytes/s, checked separately for sent and received traffic.
              - name: ClickHouse high network usage
                description: High network usage. ClickHouse network usage exceeds 100MB/s.
                query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024"
                severity: warning
                for: 2m
                comments: |
                  Please replace the threshold with an appropriate value
              - name: ClickHouse distributed rejected inserts
                description: "INSERTs into Distributed tables rejected due to pending bytes limit."
                query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3"
                severity: critical
                for: 2m
      - name: CouchDB
        exporters:
          - name: gesellix/couchdb-prometheus-exporter
            slug: gesellix-couchdb-prometheus-exporter
            doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
            rules:
              - name: CouchDB node down
                description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
                query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
                severity: critical
                for: 2m
              - name: CouchDB atom memory usage critical
                description: Atom memory usage is above 90% of limit
                query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
                severity: critical
                for: 5m
              # The capacity (1000) is a literal in the query; see the rule's own comments field.
              - name: CouchDB open databases critical
                description: Number of open databases exceeds 90% of node capacity
                query: "couchdb_httpd_open_databases > 0.9 * 1000"
                severity: critical
                for: 5m
                comments: |
                  The default max_dbs_open is 500. Adjust the threshold (currently 0.9 * 1000) to match your max_dbs_open setting.
              # The descriptor limit (65535) is a literal in the query.
              - name: CouchDB open OS files critical
                description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
                query: "couchdb_httpd_open_os_files > 0.9 * 65535"
                severity: critical
                for: 5m
                comments: |
                  Adjust 65535 to match your system's file descriptor limit (ulimit -n).
# CouchDB (continued), Solr, then the "Message brokers" group with the first RabbitMQ exporter.
              # 5xx ratio = 5xx response rate / total request rate.
              # The numerator is aggregated "without (code)": it carries a "code" label that the
              # request counter is not filtered on, and PromQL only divides series whose label
              # sets are identical, so the unaggregated form matches nothing and never fires.
              # NOTE(review): assumes both metrics otherwise share the same labels — confirm against the exporter.
              - name: CouchDB 5xx error ratio high
                description: More than 5% of HTTP requests are returning 5xx errors
                query: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / rate(couchdb_httpd_requests[5m]) > 0.05 and rate(couchdb_httpd_requests[5m]) > 0'
                severity: critical
                for: 5m
              - name: CouchDB temporary view read rate critical
                description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
                query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
                severity: critical
                for: 5m
              - name: CouchDB Mango queries scanning too many docs
                description: Some Mango queries are scanning too many documents, consider adding indexes
                query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
                severity: warning
                for: 5m
              - name: CouchDB Mango queries failed due to invalid index
                description: Some Mango queries failed to execute because the index was missing or invalid
                query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
                severity: warning
                for: 5m
              - name: CouchDB Mango docs examined high
                description: High number of documents examined per Mango queries, consider indexing
                query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
                severity: warning
                for: 5m
              # Replicator rules: any increase of a failure counter within 5 minutes.
              - name: CouchDB Replicator manager died
                description: Replication manager process has crashed
                query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator queue process died
                description: Replication queue process has crashed
                query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator reader process died
                description: Replication reader process has crashed
                query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB Replicator failed to start
                description: One or more replication tasks failed to start
                query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
                severity: critical
                for: 1m
              - name: CouchDB replication cluster unstable
                description: The replication cluster is unstable, replication may be interrupted
                query: "couchdb_replicator_cluster_is_stable == 0"
                severity: critical
                for: 2m
              - name: CouchDB replication read failures
                description: Replication changes feed has failed reads more than 5 times in 5 minutes
                query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
                severity: warning
                for: 5m
              # NOTE(review): the two rules below use process_* metrics with no label selector,
              # so they presumably match every scraped job, not only CouchDB — consider scoping by job.
              - name: CouchDB file descriptors high
                description: Process is using more than 85% of allowed file descriptors
                query: "process_open_fds / process_max_fds > 0.85 and process_max_fds > 0"
                severity: warning
                for: 5m
              - name: CouchDB process restarted
                description: CouchDB process has restarted recently
                query: "changes(process_start_time_seconds[1h]) > 0"
                severity: info
                for: 1m
              - name: CouchDB critical log entries
                description: Critical or error log entries detected in the last 5 minutes
                query: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5'
                severity: critical
                for: 1m
      - name: Solr
        exporters:
          - name: embedded exporter
            slug: embedded-exporter
            doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html
            rules:
              - name: Solr update errors
                description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
                severity: critical
              - name: Solr query errors
                description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
                severity: warning
                for: 5m
              - name: Solr replication errors
                description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
                severity: critical
              - name: Solr low live node count
                description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
                query: "solr_collections_live_nodes < 2"
                severity: critical
  - name: Message brokers
    services:
      - name: RabbitMQ
        exporters:
          - name: rabbitmq/rabbitmq-prometheus
            slug: rabbitmq-exporter
            doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
            rules:
              # The expected cluster size (3) is hard-coded in the query.
              - name: RabbitMQ node down
                description: Less than 3 nodes running in RabbitMQ cluster
                query: "sum(rabbitmq_build_info) < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              - name: RabbitMQ node not distributed
                description: Distribution link to peer {{ $labels.peer }} is not 'up' (state {{ $value }})
                query: "erlang_vm_dist_node_state < 3"
                severity: critical
                for: 1m
                comments: |
                  1m delay allows a restart without triggering an alert.
              # Counts the distinct rabbitmq_version label values present.
              - name: RabbitMQ instances different versions
                description: Running different version of RabbitMQ in the same cluster, can lead to failure.
                query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
                severity: warning
                for: 1h
              - name: RabbitMQ memory high
                description: A node use more than 90% of allocated RAM
                query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ file descriptors usage
                description: A node use more than 90% of file descriptors
                query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0"
                severity: warning
                for: 2m
              - name: RabbitMQ too many ready messages
                description: RabbitMQ too many ready messages on queue {{ $labels.queue }} ({{ $value }})
                query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many unack messages
                description: Too many unacknowledged messages on queue {{ $labels.queue }} ({{ $value }})
                query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
                severity: warning
                for: 1m
              - name: RabbitMQ too many connections
                description: The total connections of a node is too high
                query: "rabbitmq_connections > 1000"
                severity: warning
                for: 2m
              - name: RabbitMQ no queue consumer
                description: A queue has less than 1 consumer
                query: "rabbitmq_queue_consumers < 1"
                severity: warning
                for: 1m  # allows a short service restart
              - name: RabbitMQ unroutable messages
                description: A queue has unroutable messages ({{ $value }} in the last 5m)
                query: "increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) > 3 or increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) > 3"
                severity: warning
                for: 2m
                comments: |
                  Threshold of 3 avoids noise from occasional misroutes.
                  Adjust based on your expected traffic patterns.
- name: kbudde/rabbitmq-exporter slug: kbudde-rabbitmq-exporter doc_url: https://github.com/kbudde/rabbitmq_exporter rules: - name: RabbitMQ down description: RabbitMQ node down query: "rabbitmq_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster down description: Less than 3 nodes running in RabbitMQ cluster query: "sum(rabbitmq_running) < 3" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: RabbitMQ cluster partition description: RabbitMQ cluster has a network partition ({{ $value }} partitions detected). Messages may be lost or duplicated. query: "rabbitmq_partitions > 0" severity: critical - name: RabbitMQ out of memory description: Memory available for RabbitMQ is low (< 10%) query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0" severity: warning for: 2m - name: RabbitMQ instance too many connections description: RabbitMQ instance has too many connections (> 1000) query: "rabbitmq_connections > 1000" severity: warning for: 2m - name: RabbitMQ dead letter queue filling up description: Dead letter queue is filling up (> 10 msgs) query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' severity: warning for: 1m comments: | Indicate the queue name in dedicated label. - name: RabbitMQ too many messages in queue description: Queue is filling up (> 1000 msgs) query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' severity: warning for: 2m comments: | Indicate the queue name in dedicated label. - name: RabbitMQ slow queue consuming description: Queue messages are consumed slowly (> 60s) query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' severity: warning for: 2m comments: | Indicate the queue name in dedicated label. 
- name: RabbitMQ no consumer description: Queue has no consumer query: "rabbitmq_queue_consumers == 0" severity: critical for: 5m comments: | Allows a short service restart. - name: RabbitMQ too many consumers description: Queue should have only 1 consumer query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' severity: critical comments: | Indicate the queue name in dedicated label. - name: RabbitMQ inactive exchange description: Exchange receive less than 5 msgs per second query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' severity: warning comments: | Indicate the exchange name in dedicated label. for: 2m - name: Zookeeper exporters: - name: cloudflare/kafka_zookeeper_exporter slug: cloudflare-kafka-zookeeper-exporter doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter rules: [] - name: dabealu/zookeeper-exporter slug: dabealu-zookeeper-exporter doc_url: https://github.com/dabealu/zookeeper-exporter rules: - name: Zookeeper Down description: "Zookeeper down on instance {{ $labels.instance }}" query: "zk_up == 0" severity: critical for: 1m comments: | 1m delay allows a restart without triggering an alert. - name: Zookeeper missing leader description: "Zookeeper cluster has no node marked as leader" query: "sum(zk_server_leader) == 0" severity: critical - name: Zookeeper Too Many Leaders description: "Zookeeper cluster has {{ $value }} nodes marked as leader (expected 1), indicating a split-brain" query: "sum(zk_server_leader) > 1" severity: critical - name: Zookeeper Not Ok description: "Zookeeper instance {{ $labels.instance }} is not ok (ruok check failed)" query: "zk_ruok == 0" severity: warning for: 3m - name: Kafka exporters: - name: danielqsj/kafka_exporter slug: danielqsj-kafka-exporter doc_url: https://github.com/danielqsj/kafka_exporter rules: - name: Kafka topics replicas description: Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.
query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3" severity: critical - name: Kafka consumer group lag description: Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages) query: "sum(kafka_consumergroup_lag) by (consumergroup) > 10000" severity: warning for: 1m - name: linkedin/Burrow slug: linkedin-kafka-exporter doc_url: https://github.com/linkedin/Burrow rules: - name: Kafka topic offset decreased description: Kafka topic offset has decreased query: "delta(kafka_burrow_partition_current_offset[1m]) < 0" severity: warning - name: Kafka consumer lag description: Kafka consumer has a 30 minutes and increasing lag query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0" severity: warning for: 15m - name: Pulsar exporters: - name: embedded exporter slug: embedded-exporter doc_url: https://pulsar.apache.org/docs/reference-metrics/ rules: - name: Pulsar subscription high number of backlog entries description: "The number of subscription backlog entries is over 5k" query: sum(pulsar_subscription_back_log) by (subscription) > 5000 for: 1h severity: warning - name: Pulsar subscription very high number of backlog entries description: "The number of subscription backlog entries is over 100k" query: sum(pulsar_subscription_back_log) by (subscription) > 100000 for: 1h severity: critical - name: Pulsar topic large backlog storage size description: "The topic backlog storage size is over 5 GB" query: sum(pulsar_storage_size) by (topic) > 5*1024*1024*1024 for: 1h severity: warning - name: Pulsar topic very large backlog storage size description: "The topic backlog storage size is over 20 
GB" query: sum(pulsar_storage_size) by (topic) > 20*1024*1024*1024 for: 1h severity: critical - name: Pulsar high write latency description: "Pulsar topic {{ $labels.topic }} has {{ $value }} storage write operations exceeding the maximum latency bucket (> 1000ms)" query: sum(pulsar_storage_write_latency_le_overflow > 0) by (topic) for: 1h severity: critical comments: | pulsar_storage_write_latency_le_overflow is the overflow bucket of Pulsar's non-standard histogram. It counts write operations exceeding all defined latency bounds (> 1000ms). - name: Pulsar large message payload description: "Pulsar topic {{ $labels.topic }} has {{ $value }} message entries exceeding the maximum size bucket (> 1MB)" query: sum(pulsar_entry_size_le_overflow > 0) by (topic) for: 1h severity: warning comments: | pulsar_entry_size_le_overflow is the overflow bucket of Pulsar's non-standard histogram. It counts message entries exceeding all defined size bounds. - name: Pulsar high ledger disk usage description: "Observing Ledger Disk Usage (> 75%)" query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75 for: 1h severity: critical comments: | This metric name is path-dependent and may differ based on your BookKeeper data directory configuration. Adjust the metric name to match your actual ledger directory path. 
- name: Pulsar read only bookies description: "Observing Readonly Bookies" query: count(bookie_SERVER_STATUS{} == 0) by (pod) for: 5m severity: critical - name: Pulsar high number of function errors description: "Pulsar function {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)" query: sum(rate(pulsar_function_user_exceptions_total[1m]) + rate(pulsar_function_system_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Pulsar high number of sink errors description: "Pulsar sink {{ $labels.name }} has more than 10 errors per second ({{ $value | printf \"%.2f\" }}/s)" query: sum(rate(pulsar_sink_sink_exceptions_total[1m])) by (name) > 10 for: 1m severity: critical - name: Nats exporters: - name: nats-io/prometheus-nats-exporter slug: nats-exporter doc_url: https://github.com/nats-io/prometheus-nats-exporter rules: - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} query: "gnatsd_varz_routes > 10" severity: warning for: 3m - name: Nats high memory usage description: NATS server memory usage is above 200MB for {{ $labels.instance }} query: "gnatsd_varz_mem > 200 * 1024 * 1024" severity: warning for: 5m - name: Nats slow consumers description: There are slow consumers in NATS for {{ $labels.instance }} query: "gnatsd_varz_slow_consumers > 0" severity: critical for: 3m - name: Nats server down description: NATS server has been down for more than 5 minutes query: 'absent(up{job="nats"})' severity: critical for: 5m comments: | Replace job="nats" with the actual job name in your Prometheus configuration. - name: Nats high CPU usage description: NATS server is using more than 80% CPU for the last 5 minutes query: "gnatsd_varz_cpu > 80" severity: warning for: 5m comments: | gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale). 
- name: Nats high number of connections description: NATS server has more than 1000 active connections query: "gnatsd_connz_num_connections > 1000" severity: warning for: 5m - name: Nats high JetStream store usage description: JetStream store usage is over 80% query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0" severity: warning for: 5m - name: Nats high JetStream memory usage description: JetStream memory usage is over 80% query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0" severity: warning for: 5m - name: Nats high number of subscriptions description: NATS server has more than 1000 active subscriptions query: "gnatsd_varz_subscriptions > 1000" severity: warning for: 5m - name: Nats high pending bytes description: NATS server has more than 100,000 pending bytes query: "gnatsd_connz_pending_bytes > 100000" severity: warning for: 5m - name: Nats too many errors description: NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5" severity: warning for: 5m - name: Nats JetStream accounts exceeded description: JetStream has more than 100 active accounts query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" severity: warning for: 5m - name: Nats leaf node connection issue description: No leaf node connections on {{ $labels.instance }} query: "gnatsd_varz_leafnodes == 0" severity: warning for: 5m comments: | Only enable this alert if your deployment requires leaf node connections. This will fire spuriously if leaf nodes are not configured. 
- name: Proxies, load balancers and service meshes services: - name: Nginx exporters: - name: knyar/nginx-lua-prometheus slug: knyar-nginx-exporter doc_url: https://github.com/knyar/nginx-lua-prometheus rules: - name: Nginx high HTTP 4xx error rate description: Too many HTTP requests with status 4xx (> 5%) query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' severity: critical for: 1m - name: Nginx high HTTP 5xx error rate description: Too many HTTP requests with status 5xx (> 5%) query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0' severity: critical for: 1m - name: Nginx latency high description: Nginx p99 latency is higher than 3 seconds query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3" severity: warning for: 2m - name: Apache exporters: - name: Lusitaniae/apache_exporter slug: lusitaniae-apache-exporter doc_url: https://github.com/Lusitaniae/apache_exporter rules: - name: Apache down description: Apache down query: "apache_up == 0" severity: critical - name: Apache workers load description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }} query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0' severity: warning for: 2m - name: Apache restart description: Apache has just been restarted. 
query: "apache_uptime_seconds_total / 60 < 1" severity: info - name: HaProxy exporters: - name: Embedded exporter (HAProxy >= v2) slug: embedded-exporter-v2 doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter rules: - name: HAProxy high HTTP 4xx error rate backend description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }} query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy high HTTP 4xx error rate server description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy server response errors description: Too many response errors to {{ $labels.server }} server (> 5%). 
query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0 severity: critical for: 1m - name: HAProxy backend connection errors description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high. query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 severity: critical for: 1m - name: HAProxy server connection errors description: Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high. query: (sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100 severity: critical - name: HAProxy backend max active session > 80% description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}% query: (haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0 severity: warning for: 2m - name: HAProxy pending requests description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (haproxy_backend_current_queue) > 0 comments: | haproxy_backend_current_queue is a gauge (current queue depth), not a counter. 
severity: warning for: 2m - name: HAProxy HTTP slowing down description: HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}s query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1 severity: warning for: 1m - name: HAProxy retry high description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}} query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 severity: warning for: 2m - name: HAproxy has no alive backends description: HAProxy has no alive active or backup backends for {{ $labels.proxy }} query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0 severity: critical - name: HAProxy frontend security blocked requests description: HAProxy is blocking requests for security reason query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10 severity: warning for: 2m - name: HAProxy server healthcheck failure description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) query: increase(haproxy_server_check_failures_total[1m]) > 2 severity: warning - name: prometheus/haproxy_exporter (HAProxy < v2) slug: haproxy-exporter-v1 doc_url: https://github.com/prometheus/haproxy_exporter rules: - name: HAProxy down description: HAProxy down query: "haproxy_up == 0" severity: critical - name: HAProxy high HTTP 4xx error rate backend (v1) description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }} query: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate backend (v1) description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }} query: 'sum by (backend) 
(rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 4xx error rate server (v1) description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }} query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy high HTTP 5xx error rate server (v1) description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }} query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0' severity: critical for: 1m - name: HAProxy server response errors (v1) description: Too many response errors to {{ $labels.server }} server (> 5%). query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0" severity: critical for: 1m - name: HAProxy backend connection errors (v1) description: Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high. query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100" severity: critical for: 1m - name: HAProxy server connection errors (v1) description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high. 
query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100" severity: critical - name: HAProxy backend max active session description: HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%). query: "((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0" severity: warning for: 2m - name: HAProxy pending requests (v1) description: Some HAProxy requests are pending on {{ $labels.backend }} backend query: "sum by (backend) (haproxy_backend_current_queue) > 0" severity: warning for: 2m - name: HAProxy HTTP slowing down (v1) description: Average request time is increasing query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1" severity: warning for: 1m - name: HAProxy retry high (v1) description: High rate of retry on {{ $labels.backend }} backend query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10" severity: warning for: 2m - name: HAProxy backend down description: HAProxy backend is down query: "haproxy_backend_up == 0" severity: critical - name: HAProxy server down description: HAProxy server is down query: "haproxy_server_up == 0" severity: critical - name: HAProxy frontend security blocked requests (v1) description: HAProxy is blocking requests for security reason query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10" severity: warning for: 2m - name: HAProxy server healthcheck failure (v1) description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m) query: "increase(haproxy_server_check_failures_total[1m]) > 2" severity: warning - name: Traefik exporters: - name: Embedded exporter v2 slug: embedded-exporter-v2 doc_url: https://docs.traefik.io/observability/metrics/prometheus/ rules: - name: Traefik service down description: All Traefik services are down query:
"sum(traefik_service_server_up) by (service) == 0" severity: critical comments: | sum() is required here: count() counts series regardless of their value and never returns 0. traefik_service_server_up is 1 for an up server and 0 for a down one, so the sum is 0 only when every server of the service is down. - name: Traefik high HTTP 4xx error rate service description: Traefik service 4xx error rate is above 5% query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' severity: critical for: 1m - name: Traefik high HTTP 5xx error rate service description: Traefik service 5xx error rate is above 5% query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5 and sum(rate(traefik_service_requests_total[3m])) by (service) > 0' severity: critical for: 1m - name: Embedded exporter v1 slug: embedded-exporter-v1 doc_url: https://docs.traefik.io/observability/metrics/prometheus/ rules: - name: Traefik backend down description: All Traefik backends are down query: "sum(traefik_backend_server_up) by (backend) == 0" severity: critical comments: | sum() is required here: count() counts series regardless of their value and never returns 0. traefik_backend_server_up is 1 for an up server and 0 for a down one, so the sum is 0 only when every server of the backend is down. - name: Traefik high HTTP 4xx error rate backend description: Traefik backend 4xx error rate is above 5% query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' severity: critical for: 1m - name: Traefik high HTTP 5xx error rate backend description: Traefik backend 5xx error rate is above 5% query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 and sum(rate(traefik_backend_requests_total[3m])) by (backend) > 0' severity: critical for: 1m - name: Caddy exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://caddyserver.com/docs/metrics rules: - name: Caddy Reverse Proxy Down description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy"
query: "caddy_reverse_proxy_upstreams_healthy == 0" severity: critical - name: Caddy high HTTP 4xx error rate service description: "Caddy service 4xx error rate is above 5%" query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' severity: critical for: 1m - name: Caddy high HTTP 5xx error rate service description: "Caddy service 5xx error rate is above 5%" query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0' severity: critical for: 1m - name: Envoy exporters: - name: Built-in metrics slug: embedded-exporter doc_url: https://www.envoyproxy.io/docs/envoy/latest/configuration/observability/statistics rules: - name: Envoy server not live description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}" query: "envoy_server_live != 1" severity: critical for: 1m - name: Envoy high memory usage description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}" query: "envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0" severity: warning for: 5m - name: Envoy high downstream HTTP 5xx error rate description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' severity: critical for: 1m - name: Envoy high downstream HTTP 4xx error rate description: "More than 10% of 
downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)" query: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0' severity: warning for: 5m - name: Envoy downstream connections overflowing description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_downstream_cx_overflow[5m]) > 5" severity: warning - name: Envoy cluster membership empty description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members" query: "envoy_cluster_membership_healthy == 0" severity: critical for: 1m - name: Envoy cluster membership degraded description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)" query: "envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0" severity: warning for: 5m - name: Envoy high cluster upstream connection failures description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10" severity: warning for: 5m - name: Envoy high cluster upstream request timeout rate description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: "rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0" severity: warning for: 5m - name: Envoy high cluster upstream 5xx error rate description: "More than 5% of upstream 
requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0' severity: critical for: 1m - name: Envoy cluster health check failures description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_health_check_failure[5m]) > 5" severity: warning for: 5m - name: Envoy cluster outlier detection ejections active description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query: "envoy_cluster_outlier_detection_ejections_active > 0" severity: info for: 5m - name: Envoy listener SSL connection errors description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_ssl_connection_error[5m]) > 5" severity: warning - name: Envoy global downstream connections overflowing description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5" severity: critical - name: Envoy SSL certificate expiring soon description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days" query: "envoy_server_days_until_first_cert_expiring < 7" severity: warning - name: Envoy SSL certificate expired description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired or expires in less than a day" query: "envoy_server_days_until_first_cert_expiring <= 0" severity: critical comments: | Envoy gauges are unsigned integers, so this metric bottoms out at 0 instead of going negative and a strict "less than 0" comparison can never match. A value of 0 means the earliest certificate has expired or has less than one day left. - name: Envoy cluster circuit breaker tripped description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}" query:
"envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1" severity: critical - name: Envoy no healthy upstream description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3" severity: critical - name: Envoy high downstream request timeout rate description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)" query: "increase(envoy_http_downstream_rq_timeout[5m]) > 5" severity: warning for: 5m - name: Linkerd exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://linkerd.io/2/tasks/exporting-metrics/ rules: - name: Linkerd high error rate description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%" query: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0' comments: | Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}. severity: warning for: 1m - name: Istio exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ rules: - name: Istio Kubernetes gateway availability drop description: Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected. 
query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' severity: warning for: 1m - name: Istio Pilot high push error rate description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration. query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0" severity: warning for: 1m - name: Istio Mixer Prometheus dispatches low description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly. query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' severity: warning for: 1m comments: | Mixer was deprecated in Istio 1.5 and removed in Istio 1.8+. This alert only applies to Istio < 1.8. - name: Istio high total request rate description: Global request rate in the service mesh is unusually high ({{ $value | printf "%.2f" }} req/s). query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' severity: warning for: 2m comments: | Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic. - name: Istio low total request rate description: Global request rate in the service mesh is unusually low ({{ $value | printf "%.2f" }} req/s). query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' severity: warning for: 2m comments: | Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or low-traffic environments. - name: Istio high 4xx error rate description: High percentage of HTTP 4xx responses in Istio ({{ $value | printf "%.1f" }}% > 5%). 
query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high 5xx error rate description: High percentage of HTTP 5xx responses in Istio ({{ $value | printf "%.1f" }}% > 5%). query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0' severity: warning for: 1m - name: Istio high request latency description: Istio average request duration is {{ $value }}ms (> 100ms). query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0' severity: warning for: 1m - name: Istio latency 99 percentile description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms)." query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000" severity: warning for: 1m - name: Istio Pilot Duplicate Entry description: Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries. 
query: "sum(pilot_duplicate_envoy_clusters{}) > 0" severity: critical - name: Runtimes services: - name: PHP-FPM exporters: - name: bakins/php-fpm-exporter slug: bakins-fpm-exporter doc_url: https://github.com/bakins/php-fpm-exporter rules: - name: PHP-FPM max-children reached description: PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m) query: "sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3" severity: warning - name: JVM exporters: - name: java-client slug: jvm-exporter doc_url: https://github.com/prometheus/client_java rules: - name: JVM memory filling up description: JVM memory is filling up (> 80%) query: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0' severity: warning for: 2m - name: JVM non-heap memory filling up description: JVM non-heap memory (metaspace/code cache) is filling up (> 80%) query: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80' severity: warning for: 2m comments: | Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire. The query filters out max_bytes <= 0 to avoid false negatives. 
- name: JVM GC time too high description: JVM is spending too much time in garbage collection (> 5% of wall clock time) query: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05' severity: warning for: 5m - name: JVM threads deadlocked description: JVM has deadlocked threads query: 'jvm_threads_deadlocked > 0' severity: critical for: 1m - name: JVM thread count high description: JVM thread count is high (> 300), potential thread leak query: 'jvm_threads_current > 300' severity: warning for: 5m - name: JVM threads BLOCKED description: JVM has high number of BLOCKED threads, indicating lock contention query: 'jvm_threads_state{state="BLOCKED"} > 50' severity: warning for: 5m - name: JVM old gen GC frequency description: Frequent old/major GC cycles, indicating memory pressure query: 'rate(jvm_gc_collection_seconds_count{gc=~"(?i).*(old|major|marksweep).*"}[5m]) > 0.3' severity: warning for: 5m comments: | The gc label carries the collector name reported by the JVM (for example "G1 Old Generation", "PS MarkSweep" or "ConcurrentMarkSweep"), so the match is case-insensitive and also covers the MarkSweep collectors used by CMS and Parallel GC. It will not match ZGC or Shenandoah cycle names. Adjust the gc label filter if you use a different collector. - name: JVM direct buffer pool filling up description: JVM direct buffer pool is filling up (> 90%) query: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0' severity: warning for: 5m comments: | jvm_buffer_pool_capacity_bytes is the total capacity of the buffers currently allocated in a pool, not a configured limit such as -XX:MaxDirectMemorySize, so this ratio can stay close to 100% in normal operation. The query covers every buffer pool; add a pool="direct" matcher to restrict it. Validate against your workload before relying on this alert. - name: JVM objects pending finalization description: JVM has objects pending finalization, potential memory leak query: 'jvm_memory_objects_pending_finalization > 1000' severity: warning for: 5m - name: JVM file descriptors exhaustion description: JVM process is running out of file descriptors (> 90% used) query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' severity: warning for: 5m comments: | process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific. This alert will also fire for Go, Python, or any process exposing these metrics.
- name: JVM class loading anomaly description: Rapid class loading detected, potential classloader leak query: 'rate(jvm_classes_loaded_total[5m]) > 100' severity: warning for: 5m - name: JVM compilation time spike description: Excessive JIT compilation time consuming CPU query: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1' severity: warning for: 5m - name: Golang exporters: - name: client_golang slug: golang-exporter doc_url: https://github.com/prometheus/client_golang rules: - name: Go goroutine count high description: Go application has too many goroutines (> 1000), potential goroutine leak query: 'go_goroutines > 1000' severity: warning for: 5m comments: | Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline. - name: Go GC duration high description: Go GC pause duration is too high (max > 1s) query: 'go_gc_duration_seconds{quantile="1"} > 1' severity: warning for: 5m comments: | quantile="1" is the maximum observed GC pause in the current summary window, not p99. A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated. - name: Go memory usage high description: Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak query: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90' severity: warning for: 5m comments: | go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory. This ratio measures Go-internal memory utilization, not system-level memory pressure. - name: Go thread count high description: Go OS thread count is high (> 500), potential blocking syscall or CGo leak query: 'go_threads > 500' severity: warning for: 5m comments: | Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline. 
- name: Go heap objects count high description: Go heap has too many live objects (> 10M), high GC pressure query: 'go_memstats_heap_objects > 10000000' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's normal object count. - name: Go GC CPU fraction high description: Go GC is consuming too much CPU (> 5%) query: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05' severity: warning for: 5m comments: | rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC. This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+. - name: Go goroutine spike description: Go goroutine count is growing rapidly ({{ $value | printf "%.0f" }} goroutines/s) query: 'deriv(go_goroutines[5m]) > 10' severity: warning for: 5m comments: | A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m). Adjust based on your application's expected concurrency patterns. - name: Go heap in-use growing description: Go heap in-use memory is growing steadily, potential memory leak or under-sized heap query: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7' severity: warning comments: | Alerts when heap in-use grows by more than 10MB/s sustained over 10 minutes. Adjust threshold based on your workload. 
- name: Go memory leak description: Go application has sustained high allocation rate (> 1GB/s), potential memory leak query: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9' severity: warning for: 5m - name: Go stack memory high description: Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion query: 'go_memstats_stack_inuse_bytes > 1e9' severity: warning for: 5m - name: Ruby exporters: - name: prometheus_exporter slug: ruby-exporter doc_url: https://github.com/discourse/prometheus_exporter rules: - name: Ruby heap live slots high description: Ruby heap has too many live slots (> 500k), heap bloat query: 'ruby_heap_live_slots > 500000' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's normal heap size. - name: Ruby heap free slots high description: Ruby heap has too many free slots (> 500k), memory fragmentation after large allocations query: 'ruby_heap_free_slots > 500000' severity: warning for: 5m - name: Ruby major GC rate high description: Ruby is performing too many major GC cycles, indicating memory pressure query: 'rate(ruby_major_gc_ops_total[5m]) > 2' severity: warning for: 5m comments: | Major GC rate > 5/s only fires if the app is essentially non-functional. Threshold of 2/s provides earlier detection. 
- name: Ruby RSS high description: Ruby process RSS is high (> 1GB) query: 'ruby_rss > 1e9' severity: warning for: 5m - name: Ruby allocated objects spike description: Ruby is allocating objects at a high rate query: 'rate(ruby_allocated_objects_total[5m]) > 100000' severity: warning for: 5m - name: Python exporters: - name: client_python slug: python-exporter doc_url: https://github.com/prometheus/client_python rules: - name: Python GC objects uncollectable description: Python has uncollectable objects ({{ $value }}), potential memory leak via reference cycles query: 'increase(python_gc_objects_uncollectable_total[5m]) > 1' severity: warning for: 5m - name: Python GC collections high description: Python GC is collecting too many objects (> 10k/s), high allocation pressure query: 'rate(python_gc_objects_collected_total[5m]) > 10000' severity: warning for: 5m - name: Python file descriptors exhaustion description: Python process is running out of file descriptors (> 90% used) query: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0' severity: warning for: 5m comments: | process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not Python-specific. - name: Python GC generation 2 collections high description: Python full GC (generation 2) is running too frequently, indicating memory pressure query: 'rate(python_gc_collections_total{generation="2"}[5m]) > 1' severity: warning for: 5m comments: | Gen2 collection rate > 1/s is very high. In most applications, gen2 runs are infrequent. Adjust threshold based on your workload. - name: Python virtual memory high description: Python process virtual memory is high (> 4GB) query: 'process_virtual_memory_bytes > 4e9' severity: warning for: 5m comments: | Threshold is a rough default. Adjust based on your application's expected memory footprint. 
- name: Sidekiq exporters: - name: Strech/sidekiq-prometheus-exporter slug: strech-sidekiq-exporter doc_url: https://github.com/Strech/sidekiq-prometheus-exporter rules: - name: Sidekiq queue size description: Sidekiq queue {{ $labels.name }} is growing ({{ $value }} enqueued jobs) query: "sidekiq_queue_enqueued_jobs > 100" severity: warning for: 1m - name: Sidekiq scheduling latency too high description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing. query: "max(sidekiq_queue_latency_seconds) > 60" severity: critical - name: Data engineering services: - name: Apache Flink exporters: - name: Built-in Prometheus reporter slug: flink-prometheus-reporter doc_url: https://nightlies.apache.org/flink/flink-docs-stable/docs/deployment/metric_reporters/ rules: - name: Flink job is not running description: "No Flink jobs are currently running. All jobs may have failed or been cancelled." query: "flink_jobmanager_numRunningJobs == 0" severity: critical for: 1m - name: Flink no TaskManagers registered description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity." query: "flink_jobmanager_numRegisteredTaskManagers == 0" severity: critical for: 1m - name: Flink all task slots used description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled." query: "flink_jobmanager_taskSlotsAvailable == 0" severity: warning for: 5m comments: | This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity. - name: Flink job restart increasing description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes." query: "delta(flink_jobmanager_job_numRestarts[5m]) > 1"
severity: warning for: 5m comments: | Flink exposes numRestarts as a gauge (cumulative count), so delta() is used instead of increase(). A single restart may be normal during deployments. Adjust threshold based on restart tolerance. - name: Flink checkpoint failures description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes." query: "delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1" severity: warning for: 5m - name: Flink checkpoint duration high description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete." query: "flink_jobmanager_job_lastCheckpointDuration / 1000 > 60" severity: warning for: 5m comments: | Value is converted from milliseconds to seconds for correct humanizeDuration display. Threshold is 60 seconds. Adjust based on your checkpoint interval and state size. - name: Flink task backpressured description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured." query: "flink_taskmanager_job_task_isBackPressured == 1" severity: warning for: 5m - name: Flink task high backpressure time description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure." query: "flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500" severity: warning for: 5m comments: | Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate. - name: Flink TaskManager heap memory high description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%." query: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0" severity: warning for: 5m comments: | Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration. - name: Flink JobManager heap memory high description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%."
query: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0" severity: warning for: 5m - name: Flink TaskManager GC time high description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection." query: "deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100" severity: warning for: 5m comments: | Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate(). Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload. - name: Flink no records processed description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes." query: "delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0" severity: warning for: 5m comments: | Only fires for tasks that have previously received records, to avoid false positives during startup. - name: Apache Spark exporters: - name: Built-in Prometheus (PrometheusServlet + PrometheusResource) slug: spark-prometheus doc_url: https://spark.apache.org/docs/latest/monitoring.html comments: | Spark exposes metrics via two built-in endpoints: - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040) - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x) Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging. Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet rules: - name: Spark no alive workers description: "No Spark workers are alive. The cluster has no processing capacity." 
query: "metrics_master_aliveWorkers_Value == 0" severity: critical for: 1m - name: Spark too many waiting apps description: "Spark has {{ $value }} applications waiting for resources." query: "metrics_master_waitingApps_Value > 10" severity: warning for: 5m comments: | Adjust the threshold based on your cluster's typical queuing behavior. - name: Spark worker memory exhausted description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free)." query: "metrics_worker_memFree_MB_Value == 0" severity: warning for: 2m - name: Spark worker cores exhausted description: "Spark worker {{ $labels.instance }} has no free cores." query: "metrics_worker_coresFree_Value == 0" severity: warning for: 5m comments: | Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues. - name: Spark executor high GC time description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC." query: "metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration > 0.1 and metrics_executor_totalDuration > 0" severity: warning for: 5m comments: | Fires when more than 10% of executor time is spent in garbage collection. This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/). - name: Spark executor all tasks failing description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed)." query: "metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0" severity: critical for: 5m - name: Spark executor high task failure rate description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%." 
query: "metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0" severity: warning for: 5m - name: Spark executor high disk spill description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory." query: "metrics_executor_diskUsed_bytes > 1e9" severity: warning for: 5m comments: | diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default. Disk spilling indicates insufficient memory for the workload. - name: Hadoop exporters: - name: hadoop/jmx_exporter slug: jmx_exporter doc_url: https://github.com/prometheus/jmx_exporter rules: # Alert rule for NameNode availability - name: Hadoop Name Node Down query: up{job="hadoop-namenode"} == 0 for: 5m severity: critical description: "The Hadoop NameNode service is unavailable." comments: | When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, so this alert may not fire. Prefer application-level availability metrics if available. Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config. # Alert rule for ResourceManager availability - name: Hadoop Resource Manager Down query: up{job="hadoop-resourcemanager"} == 0 for: 5m severity: critical description: "The Hadoop ResourceManager service is unavailable." comments: | When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0, so this alert may not fire. Prefer application-level availability metrics if available. Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config. # Alert rule for DataNode status - name: Hadoop Data Node Out Of Service query: hadoop_datanode_last_heartbeat == 0 for: 10m severity: warning description: "The Hadoop DataNode is not sending heartbeats." 
# Alert rule for low HDFS disk space - name: Hadoop HDFS Disk Space Low query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0 for: 15m severity: warning description: "Available HDFS disk space is running low." # Alert rule for excessive MapReduce task failures - name: Hadoop Map Reduce Task Failures query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100 for: 10m severity: critical description: "There is an unusually high number of MapReduce task failures." # Alert rule for high ResourceManager memory usage - name: Hadoop Resource Manager Memory High query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0 for: 15m severity: warning description: "The Hadoop ResourceManager is approaching its memory limit." # Alert rule for high YARN container allocation failures - name: Hadoop YARN Container Allocation Failures query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10 for: 10m severity: warning description: "There is a significant number of YARN container allocation failures." # Alert rule for excessive HBase region server region count - name: Hadoop HBase Region Count High query: hadoop_hbase_region_count > 5000 for: 15m severity: warning description: "The HBase cluster has an unusually high number of regions." # Alert rule for low HBase region server heap space - name: Hadoop HBase Region Server Heap Low query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0 for: 10m severity: warning description: "HBase Region Servers are running low on heap space." # Alert rule for high HBase Write Requests latency - name: Hadoop HBase Write Requests Latency High query: hadoop_hbase_write_requests_latency_seconds > 0.5 for: 10m severity: warning description: "HBase Write Requests are experiencing high latency." 
- name: Orchestrators services: - name: Kubernetes exporters: - name: kube-state-metrics slug: kubestate-exporter doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs rules: - name: Kubernetes Node not ready description: Node {{ $labels.node }} has been unready for a long time query: 'kube_node_status_condition{condition="Ready",status="true"} == 0' severity: critical for: 10m - name: Kubernetes Node scheduling disabled description: Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes. query: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1' severity: warning for: 30m comments: | Kubernetes Node with disabled schedules are fine. This alarm can be useful to get warned if there are nodes which are longer unscheduled. - name: Kubernetes Node memory pressure description: "Node {{ $labels.node }} has MemoryPressure condition" query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node disk pressure description: "Node {{ $labels.node }} has DiskPressure condition" query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node network unavailable description: "Node {{ $labels.node }} has NetworkUnavailable condition" query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' severity: critical for: 2m - name: Kubernetes Node out of pod capacity description: "Node {{ $labels.node }} is out of pod capacity" query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' severity: warning for: 2m - name: Kubernetes Container oom killer description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 
10 minutes." query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' severity: warning - name: Kubernetes Job failed description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete" query: "kube_job_status_failed > 0" severity: warning - name: Kubernetes Job not starting description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes" query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600" severity: warning - name: Kubernetes CronJob failing description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing" query: "(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)" severity: critical - name: Kubernetes CronJob suspended description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" query: "kube_cronjob_spec_suspend != 0" severity: warning - name: Kubernetes PersistentVolumeClaim pending description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending" query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1' severity: warning for: 2m - name: Kubernetes Volume out of disk space description: Volume is almost full (< 10% left) query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0" severity: warning for: 2m - name: Kubernetes Volume full in four days description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available." 
query: "(kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100) and (predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0)" severity: critical comments: | The left-hand side makes the alert value the percentage of space currently available, which is what the description prints. The predict_linear() clause alone decides whether the alert fires (available bytes extrapolated to reach zero within four days). - name: Kubernetes PersistentVolume error description: "Persistent volume {{ $labels.persistentvolume }} is in bad state" query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0' severity: critical - name: Kubernetes StatefulSet down description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0" severity: critical for: 1m - name: Kubernetes HPA scale inability description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' severity: warning for: 2m - name: Kubernetes HPA metrics unavailability description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' severity: warning - name: Kubernetes HPA scale maximum description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods query: "(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)" severity: info for: 2m - name: Kubernetes HPA underutilized description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3" # allow minimum 3 replicas running severity: info - name: Kubernetes Pod not healthy description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes. query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' severity: critical for: 15m - name: Kubernetes pod crash looping description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping query: "increase(kube_pod_container_status_restarts_total[1m]) > 3" severity: warning for: 2m - name: Kubernetes ReplicaSet replicas mismatch description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas" severity: warning for: 10m - name: Kubernetes Deployment replicas mismatch description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available" severity: warning for: 10m - name: Kubernetes StatefulSet replicas mismatch description: StatefulSet does not match the expected number of replicas. query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas" severity: warning for: 10m - name: Kubernetes Deployment generation mismatch description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back. query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation" severity: critical for: 10m - name: Kubernetes StatefulSet generation mismatch description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back. 
query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation" severity: critical for: 10m - name: Kubernetes StatefulSet update not rolled out description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)" severity: warning for: 10m - name: Kubernetes DaemonSet rollout stuck description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready query: "(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0" severity: warning for: 10m - name: Kubernetes DaemonSet misscheduled description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run query: "kube_daemonset_status_number_misscheduled > 0" severity: critical for: 1m - name: Kubernetes CronJob too long description: CronJob {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete. query: "(kube_job_status_start_time > 0 and (time() - kube_job_status_start_time) > 3600) unless on (namespace, job_name) kube_job_status_completion_time" severity: warning comments: | Threshold should be customized for each cronjob name. The check is made per Job (namespace, job_name): it fires for Jobs started more than 1h ago that have no completion time yet. absent() cannot be used for this because it returns a label-less vector that never matches per-Job series. Jobs that end in failure may never report a completion time and can keep this alert firing until they are deleted. - name: Kubernetes Job slow completion description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0" severity: critical for: 12h - name: Kubernetes API server errors description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate" query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0' severity: critical for: 2m - name: Kubernetes API client errors description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate" query: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0' severity: critical for: 2m - name: Kubernetes client certificate expires next week description: A client certificate used to authenticate to the apiserver is expiring next week. query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60' severity: warning - name: Kubernetes client certificate expires soon description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60' severity: critical - name: Kubernetes API server latency description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." 
query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' severity: warning for: 2m - name: Nomad exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Nomad job failed description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations." query: "nomad_nomad_job_summary_failed > 0" severity: warning - name: Nomad job lost description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations." query: "nomad_nomad_job_summary_lost > 0" severity: warning - name: Nomad job queued description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations." query: "nomad_nomad_job_summary_queued > 0" severity: warning for: 2m - name: Nomad blocked evaluation description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations." query: "nomad_nomad_blocked_evals_total_blocked > 0" severity: warning - name: Consul exporters: - name: prometheus/consul_exporter slug: consul-exporter doc_url: https://github.com/prometheus/consul_exporter rules: - name: Consul service healthcheck failed description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`" query: "consul_catalog_service_node_healthy == 0" severity: critical for: 1m # allows a short service restart - name: Consul missing master node description: Numbers of consul raft peers should be 3, in order to preserve quorum. 
query: "consul_raft_peers < 3" severity: critical - name: Consul agent unhealthy description: A Consul agent is down query: 'consul_health_node_status{status="critical"} == 1' severity: critical - name: Etcd exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Etcd insufficient Members description: Etcd cluster should have an odd number of members query: "count(etcd_server_id) % 2 == 0" severity: critical - name: Etcd no Leader description: Etcd cluster have no leader query: "etcd_server_has_leader == 0" severity: critical - name: Etcd high number of leader changes description: "Etcd leader changed {{ $value }} times during 10 minutes" query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2" severity: warning - name: Etcd high number of failed GRPC requests warning description: More than 1% GRPC request failure detected in Etcd query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' severity: warning for: 2m comments: | Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - name: Etcd high number of failed GRPC requests critical description: More than 5% GRPC request failure detected in Etcd query: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0' severity: critical for: 2m comments: | Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. 
- name: Etcd GRPC requests slow description: GRPC requests slowing down, 99th percentile is over 0.15s query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15' severity: warning for: 2m # etcd_http_* metrics are from the v2 API and were removed in etcd 3.x. # These rules only apply if you are running etcd 2.x. - name: Etcd high number of failed HTTP requests warning description: More than 1% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: warning for: 2m comments: "These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x." - name: Etcd high number of failed HTTP requests critical description: More than 5% HTTP failure detected in Etcd query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0" severity: critical for: 2m comments: "These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x." - name: Etcd HTTP requests slow description: HTTP requests slowing down, 99th percentile is over 0.15s query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15" severity: warning for: 2m comments: "This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x." 
- name: Etcd member communication slow description: Etcd member communication slowing down, 99th percentile is over 0.15s query: "histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15" severity: warning for: 2m - name: Etcd high number of failed proposals description: "Etcd server got {{ $value }} failed proposals in the past hour" query: "increase(etcd_server_proposals_failed_total[1h]) > 5" severity: warning for: 2m - name: Etcd high fsync durations description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s query: "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5" severity: warning for: 2m - name: Etcd high commit durations description: Etcd commit duration increasing, 99th percentile is over 0.25s query: "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25" severity: warning for: 2m - name: OpenStack exporters: - name: openstack-exporter/openstack-exporter slug: openstack-exporter doc_url: https://github.com/openstack-exporter/openstack-exporter rules: - name: OpenStack exporter down description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected. query: 'up{job=~".*openstack.*"} == 0' severity: critical for: 2m comments: | Adjust the job label regex to match the actual job name in your Prometheus scrape config. 
- name: OpenStack Nova agent down description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}" query: 'openstack_nova_agent_state{adminState="enabled"} == 0' severity: critical for: 2m - name: OpenStack Neutron agent down description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down" query: 'openstack_neutron_agent_state{adminState="up"} == 0' severity: critical for: 2m - name: OpenStack Cinder agent down description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}" query: 'openstack_cinder_agent_state{adminState="enabled"} == 0' severity: critical for: 2m - name: OpenStack hypervisor high vCPU usage description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%" query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0' severity: warning for: 5m comments: | The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. - name: OpenStack hypervisor high memory usage description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%" query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0' severity: warning for: 5m comments: | The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns. 
- name: OpenStack hypervisor high disk usage description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%" query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0' severity: warning for: 5m - name: OpenStack Nova tenant vCPU quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota" query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0' severity: warning comments: | A value of -1 for limits_vcpus_max means unlimited quota (no limit set). - name: OpenStack Nova tenant memory quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota" query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0' severity: warning - name: OpenStack Nova tenant instance quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota" query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0' severity: warning - name: OpenStack Cinder tenant volume quota nearly exhausted description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota" query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0' severity: warning - name: OpenStack Cinder pool low free capacity description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity" query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0' severity: warning for: 5m - name: OpenStack Neutron floating IPs associated but not active description: "{{ $value }} floating IPs are associated to a private IP 
but are not in ACTIVE state" query: 'openstack_neutron_floating_ips_associated_not_active > 0' severity: warning for: 5m - name: OpenStack Neutron routers not active description: "{{ $value }} Neutron routers are not in ACTIVE state" query: 'openstack_neutron_routers_not_active > 0' severity: warning for: 5m - name: OpenStack Neutron subnet IP pool exhaustion description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool" query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0' severity: warning - name: OpenStack Neutron ports without IPs description: "{{ $value }} active ports have no IP addresses assigned" query: 'openstack_neutron_ports_no_ips > 0' severity: warning for: 5m - name: OpenStack load balancer not online description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}" query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0' severity: warning for: 5m - name: OpenStack Nova instances in ERROR state description: "{{ $value }} Nova instances are in ERROR state" query: 'sum(openstack_nova_server_status{status="ERROR"}) > 0' severity: warning for: 5m - name: OpenStack Cinder volumes in error state description: "{{ $value }} Cinder volumes are in an error state" query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0' severity: warning for: 5m - name: OpenStack placement resource high usage description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation" query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0' severity: warning for: 5m comments: | This alert factors in the allocation ratio to compute effective capacity. 
The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns. - name: CI/CD services: - name: Jenkins exporters: - name: Metric plugin slug: metric-plugin doc_url: https://plugins.jenkins.io/prometheus/ rules: - name: Jenkins node offline description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_node_offline_value > 0" severity: critical for: 5m - name: Jenkins no node online description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_node_online_value == 0" severity: critical - name: Jenkins healthcheck description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "jenkins_health_check_score < 1" severity: critical - name: Jenkins outdated plugins description: "{{ $value }} plugins need update" query: "sum(jenkins_plugins_withUpdate) by (instance) > 3" severity: warning for: 1d - name: Jenkins builds health score description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_health_score < 1" severity: critical - name: Jenkins run failure total description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" query: "increase(jenkins_runs_failure_total[1h]) > 100" severity: warning - name: Jenkins build tests failing description: "Last build tests failed: {{$labels.jenkins_job}}. 
Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_last_build_tests_failing > 0" severity: warning - name: Jenkins last build failed description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" query: "default_jenkins_builds_last_build_result_ordinal == 2" severity: warning comments: | * RUNNING -1 true - The build had no errors. * SUCCESS 0 true - The build had no errors. * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. * FAILURE 2 false - The build had a fatal error. * NOT_BUILT 3 false - The module was not built. * ABORTED 4 false - The build was manually aborted. - name: ArgoCD exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/ rules: - name: ArgoCD service not synced description: Service {{ $labels.name }} run by argo is currently not in sync. query: 'argocd_app_info{sync_status!="Synced"} != 0' severity: warning for: 15m - name: ArgoCD service unhealthy description: Service {{ $labels.name }} run by argo is currently not healthy. query: 'argocd_app_info{health_status!="Healthy"} != 0' severity: warning for: 15m - name: FluxCD exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://fluxcd.io/flux/monitoring/metrics/ rules: - name: Flux Kustomization Failure description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0' severity: warning for: 15m - name: Flux HelmRelease Failure description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. 
query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0' severity: warning for: 15m - name: Flux Source Issue description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s). query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0' severity: warning for: 15m - name: Flux Image Issue description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready. query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0' severity: warning for: 15m - name: GitLab CI exporters: - name: GitLab built-in exporter slug: gitlab-built-in-exporter doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/ rules: # Puma web server - name: GitLab Puma high queued connections description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread." query: "puma_queued_connections > 5" severity: warning for: 5m comments: | Queued connections indicate Puma workers are saturated. Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb. - name: GitLab Puma no available pool capacity description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy." query: "puma_pool_capacity == 0" severity: critical for: 5m - name: GitLab Puma workers not running description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total." query: "puma_running_workers < puma_workers" severity: warning for: 5m # HTTP request handling - name: GitLab high HTTP error rate description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}." 
query: 'sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) * 100 > 5 and sum by (instance) (rate(http_requests_total[5m])) > 0' severity: critical for: 5m comments: | Threshold is 5% of all requests returning server errors. Check GitLab logs at /var/log/gitlab/ for root cause. - name: GitLab high HTTP request latency description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds." query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10" severity: warning for: 5m comments: | Threshold of 10s may need adjustment based on your instance size and workload. # Sidekiq background jobs - name: GitLab Sidekiq jobs failing description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}." query: "rate(sidekiq_jobs_failed_total[5m]) > 0.1" severity: warning for: 10m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. A sustained failure rate indicates background processing issues. - name: GitLab Sidekiq queue too large description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}." query: "sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9" severity: warning for: 10m comments: | When running jobs approach the concurrency limit, new jobs will queue up. Consider scaling Sidekiq workers or increasing concurrency. - name: GitLab Sidekiq high job completion time description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }})." query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300" severity: warning for: 10m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
- name: GitLab Sidekiq high queue latency description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed." query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60" severity: warning for: 5m comments: | This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. # Database connection pool - name: GitLab database connection pool saturation description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy." query: "gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0" severity: warning for: 5m comments: | When the pool is near saturation, requests may block waiting for a connection. Increase db_pool_size in gitlab.rb or investigate slow queries. - name: GitLab database connection pool dead connections description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections." query: "gitlab_database_connection_pool_dead > 0" severity: warning for: 5m - name: GitLab database connection pool waiting description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection." query: "gitlab_database_connection_pool_waiting > 0" severity: warning for: 5m # CI/CD pipelines - name: GitLab CI pipeline creation slow description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds." query: "histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30" severity: warning for: 5m - name: GitLab CI pipeline failures increasing description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)."
query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05" severity: warning for: 10m comments: | This metric may not exist in all GitLab versions. Verify against your GitLab installation. - name: GitLab CI runner authentication failures description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures)." query: "increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5" severity: warning for: 5m comments: | Frequent runner auth failures may indicate expired tokens or misconfigured runners. # Ruby process health - name: GitLab high memory usage description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory." query: "process_resident_memory_bytes{job=~\".*gitlab.*\"} > 2e+9" severity: warning for: 10m comments: | Threshold of 2GB may need adjustment based on your instance size. High memory usage can lead to OOM kills and service disruptions. - name: GitLab Ruby heap fragmentation description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory." query: "ruby_gc_stat_ext_heap_fragmentation{job=~\".*gitlab.*\"} > 0.5" severity: warning for: 15m comments: | Heap fragmentation above 50% means a significant amount of memory is wasted. A Puma worker restart may help reclaim memory. # Uncaught errors - name: GitLab rack uncaught errors description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)." query: "rate(rack_uncaught_errors_total[5m]) > 0.05" severity: warning for: 5m # Application version / deployment - name: GitLab version mismatch description: "Multiple GitLab versions are running across the fleet." query: 'count(count by (version) (gitlab_build_info)) > 1' severity: warning comments: | This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
# File descriptors - name: GitLab high file descriptor usage description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors." query: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0' severity: warning for: 5m # Ruby threads - name: GitLab Ruby threads saturated description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }})." query: "sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5" severity: warning for: 10m - name: Workhorse slug: workhorse doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/#gitlab-workhorse rules: - name: GitLab Workhorse high error rate description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors." query: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0' severity: critical for: 5m comments: | Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. - name: GitLab Workhorse high latency description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds." query: "histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10" severity: warning for: 5m - name: GitLab Workhorse high in-flight requests description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests." query: "gitlab_workhorse_http_in_flight_requests > 100" severity: warning for: 5m comments: | Threshold of 100 may need adjustment based on instance size.
- name: Gitaly slug: gitaly doc_url: https://docs.gitlab.com/administration/gitaly/monitoring/ rules: - name: GitLab Gitaly high gRPC error rate description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors." query: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' severity: warning for: 5m comments: | Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled. - name: GitLab Gitaly resource exhausted description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)." query: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0' severity: critical for: 5m comments: | ResourceExhausted errors from Gitaly mean Git operations are being rejected due to concurrency limits. This directly impacts users trying to push, pull, or clone. - name: GitLab Gitaly high RPC latency description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)." query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le, instance)) > 1' severity: warning for: 5m - name: GitLab Gitaly CPU throttled description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups." query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1" severity: warning for: 5m comments: | Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise.
- name: GitLab Gitaly authentication failures description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})." query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3' severity: warning - name: GitLab Gitaly circuit breaker tripped description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing." query: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0' severity: critical comments: | When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail. Check Gitaly service health and logs. - name: Spinnaker exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://spinnaker.io/docs/setup/other_config/monitoring/ rules: - name: Spinnaker circuit breaker open description: "Circuit breaker {{ $labels.name }} is open on {{ $labels.instance }}, indicating repeated downstream failures." query: 'resilience4j_circuitbreaker_state{state="open"} == 1' severity: warning for: 5m - name: Spinnaker Orca queue backing up description: "Orca work queue has {{ $value }} messages ready for delivery but not yet picked up. Pipeline executions may be delayed." query: 'queue_ready_depth > 0' severity: warning for: 5m comments: | In a healthy Spinnaker, queue_ready_depth should stay at or near 0. Sustained non-zero values indicate Orca cannot keep up with incoming work. - name: Spinnaker Orca queue message lag high description: "Orca queue message lag is {{ $value }}s. Pipeline stages are waiting too long before being processed." query: 'rate(queue_message_lag_seconds_sum[5m]) / rate(queue_message_lag_seconds_count[5m]) > 30 and rate(queue_message_lag_seconds_count[5m]) > 0' severity: warning for: 5m comments: | The 30s threshold is a rough default. Adjust based on your pipeline SLOs. - name: Spinnaker dead messages description: "Orca is producing dead-lettered messages ({{ $value | humanize }}/s). 
These are tasks that exhausted all retries and will not be executed." query: 'rate(queue_dead_messages_total[5m]) > 0.05' severity: critical for: 2m - name: Spinnaker zombie executions description: "Zombie pipeline executions rate is {{ $value | humanize }}/s. These are executions with no corresponding queue messages." query: 'rate(queue_zombies_total[5m]) > 0.05' severity: warning for: 5m comments: | Zombies are pipeline executions that are running but have lost their queue entry. See https://spinnaker.io/docs/guides/runbooks/orca-zombie-executions/ - name: Spinnaker thread pool exhaustion description: "Orca message handler thread pool has {{ $value }} blocked threads on {{ $labels.instance }}. Pipeline execution throughput is degraded." query: 'threadpool_blockingQueueSize > 0' severity: warning for: 5m - name: Spinnaker polling monitor items over threshold description: "Igor polling monitor {{ $labels.monitor }} for {{ $labels.partition }} has exceeded its item threshold, preventing pipeline triggers." query: 'sum by (monitor, partition) (pollingMonitor_itemsOverThreshold) > 0' severity: critical for: 5m comments: | When this threshold is exceeded, Igor stops triggering pipelines for the affected monitor. See https://kb.armory.io/s/article/Hitting-Igor-s-caching-thresholds - name: Spinnaker polling monitor failures description: "Igor polling monitor is experiencing failures ({{ $value }} per second). CI/SCM integrations may not trigger pipelines." query: 'rate(pollingMonitor_failed_total[5m]) > 0.05' severity: warning for: 5m - name: Spinnaker high API error rate description: "Spinnaker API 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}." query: 'sum by (instance) (rate(controller_invocations_total{status="5xx"}[5m])) / sum by (instance) (rate(controller_invocations_total[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total[5m])) > 0' severity: warning for: 5m comments: | The 5% threshold is a rough default. 
Adjust based on your traffic patterns. - name: Spinnaker API rate limit throttling description: "Gate is actively throttling API requests on {{ $labels.instance }} ({{ $value }} throttled requests per second)." query: 'rate(rateLimitThrottling_total[5m]) > 0.05' severity: warning for: 2m - name: Spinnaker Clouddriver high error rate description: "Clouddriver 5xx error rate is {{ $value | humanizePercentage }} on {{ $labels.instance }}. Cloud operations may be failing." query: 'sum by (instance) (rate(controller_invocations_total{status="5xx", job=~".*clouddriver.*"}[5m])) / sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0.05 and sum by (instance) (rate(controller_invocations_total{job=~".*clouddriver.*"}[5m])) > 0' severity: warning for: 5m - name: Spinnaker AWS rate limiting description: "Clouddriver is being rate-limited by AWS on {{ $labels.instance }} ({{ $value }}ms delay). Cloud operations will be slower." query: 'amazonClientProvider_rateLimitDelayMil > 1000' severity: warning for: 5m comments: | This metric is specific to AWS cloud providers in Clouddriver. The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns. - name: Network and security services: - name: SpeedTest exporters: - name: Speedtest exporter slug: nlamirault-speedtest-exporter doc_url: https://github.com/nlamirault/speedtest_exporter rules: - name: SpeedTest Slow Internet Download description: Internet download speed is currently {{humanize $value}} Mbps. query: "avg_over_time(speedtest_download[10m]) < 100" severity: warning - name: SpeedTest Slow Internet Upload description: Internet upload speed is currently {{humanize $value}} Mbps. 
query: "avg_over_time(speedtest_upload[10m]) < 20" severity: warning - name: SSL/TLS exporters: - name: ssl_exporter slug: ribbybibby-ssl-exporter doc_url: https://github.com/ribbybibby/ssl_exporter rules: - name: SSL certificate probe failed description: Failed to fetch SSL information {{ $labels.instance }} query: ssl_probe_success == 0 severity: critical for: 1m - name: SSL certificate OCSP status unknown description: Failed to get the OCSP status for {{ $labels.instance }} query: ssl_ocsp_response_status == 2 severity: warning - name: SSL certificate revoked description: SSL certificate revoked {{ $labels.instance }} query: ssl_ocsp_response_status == 1 severity: critical - name: SSL certificate expiry (< 7 days) description: "{{ $labels.instance }} Certificate is expiring in 7 days" query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7 severity: warning - name: cert-manager exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://cert-manager.io/docs/devops-tips/prometheus-metrics/ rules: - name: Cert-Manager absent description: Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back. query: 'absent(up{job="cert-manager"})' severity: critical for: 10m - name: Cert-Manager certificate expiring soon description: The certificate {{ $labels.name }} is expiring in less than 21 days. query: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)' severity: warning for: 1h comments: | Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration. - name: Cert-Manager certificate not ready description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic." 
query: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)' severity: critical for: 10m - name: Cert-Manager hitting ACME rate limits description: Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week. query: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0' severity: critical for: 5m comments: | Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count. For cert-manager < v1.19, use: certmanager_http_acme_client_request_count. - name: Juniper exporters: - name: czerwonk/junos_exporter slug: czerwonk-junos-exporter doc_url: https://github.com/czerwonk/junos_exporter rules: - name: Juniper switch down description: The switch appears to be down query: junos_up == 0 severity: critical - name: Juniper critical Bandwidth Usage 1GiB description: Interface is highly saturated. (> 0.90 Gbit/s) query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90" severity: critical for: 1m - name: Juniper warning Bandwidth Usage 1GiB description: Interface is getting saturated. (> 0.80 Gbit/s) query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80" severity: warning for: 1m - name: CoreDNS exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: CoreDNS Panic Count description: Number of CoreDNS panics encountered query: "increase(coredns_panics_total[1m]) > 0" severity: critical - name: Freeswitch exporters: - name: znerol/prometheus-freeswitch-exporter slug: znerol-freeswitch-exporter doc_url: https://pypi.org/project/prometheus-freeswitch-exporter rules: - name: Freeswitch down description: "Freeswitch {{ $labels.instance }} is unresponsive."
query: "freeswitch_up == 0" severity: critical for: 1m - name: Freeswitch Sessions Warning description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0" severity: warning for: 10m - name: Freeswitch Sessions Critical description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%' query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0" severity: critical for: 5m - name: Hashicorp Vault exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus rules: - name: Vault sealed description: "Vault instance is sealed on {{ $labels.instance }}" query: "vault_core_unsealed == 0" severity: critical for: 1m - name: Vault too many pending tokens description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored." query: "avg by (instance) (vault_token_create_count - vault_token_store_count) > 0" severity: warning for: 5m comments: Aggregated per instance so that the instance label referenced in the description is kept. - name: Vault too many infinity tokens description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL." query: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3' severity: warning for: 5m - name: Vault cluster health description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active."
query: "sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0" severity: critical - name: Keycloak exporters: - name: aerogear/keycloak-metrics-spi slug: aerogear-keycloak-metrics-spi doc_url: https://github.com/aerogear/keycloak-metrics-spi rules: - name: Keycloak high login failure rate description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0' severity: warning for: 5m comments: | Threshold of 5% is a rough default. Adjust based on your user base and expected error rates. A spike in failed logins may indicate a brute-force attack or misconfigured client. - name: Keycloak no successful logins description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes." query: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0' severity: critical for: 5m comments: Only fires when login attempts exist but none succeed — may indicate an authentication outage. - name: Keycloak high token refresh error rate description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. 
High refresh token errors may indicate expired sessions or token store issues. - name: Keycloak high code-to-token exchange error rate description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. Code-to-token failures may indicate misconfigured OAuth clients or replay attacks. - name: Keycloak high registration failure rate description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%)." query: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. - name: Keycloak slow request response time description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average." query: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0' severity: warning for: 5m comments: | keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default. 
- name: Cloudflare exporters: - name: lablabs/cloudflare-exporter slug: lablabs-cloudflare-exporter doc_url: https://github.com/lablabs/cloudflare-exporter rules: - name: Cloudflare http 4xx error rate description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})" query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0' severity: warning - name: Cloudflare http 5xx error rate description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})" query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0' severity: critical - name: SNMP exporters: - name: prometheus/snmp_exporter slug: snmp-exporter doc_url: https://github.com/prometheus/snmp_exporter comments: | These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration. Thresholds for bandwidth and error rates are rough defaults - adjust to your environment. rules: - name: SNMP target down description: "SNMP device {{ $labels.instance }} is unreachable." query: 'up{job=~"snmp.*"} == 0' severity: critical for: 5m comments: | Rename job=~"snmp.*" to match the actual job name in your Prometheus scrape config. - name: SNMP interface down description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up." query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)' severity: critical for: 2m - name: SNMP interface high inbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%." 
query: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high outbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%." query: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high bandwidth usage inbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%." query: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' severity: warning for: 15m comments: | Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. - name: SNMP interface high bandwidth usage outbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%." query: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0' severity: warning for: 15m comments: | Threshold is a rough default. ifSpeed is a Gauge32 that maxes out at ~4.29 Gbps. For 10G+ interfaces, use ifHighSpeed (in Mbps) instead. 
- name: SNMP device restarted description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes)." query: "sysUpTime / 100 < 300" severity: info comments: sysUpTime is in centiseconds (hundredths of a second). - name: Cilium exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://docs.cilium.io/en/stable/observability/metrics/ rules: # Agent health - name: Cilium agent unreachable nodes description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health." query: "sum(cilium_unreachable_nodes{}) by (pod) > 0" severity: warning for: 15m comments: | Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+). - name: Cilium agent unreachable health endpoints description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing." query: "sum(cilium_unreachable_health_endpoints{}) by (pod) > 0" severity: warning for: 15m comments: | Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+). - name: Cilium agent failing controllers description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details." query: "sum(cilium_controllers_failing{}) by (pod) > 0" severity: warning for: 5m comments: | Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+). # Endpoints - name: Cilium agent endpoint failures description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state." query: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0' severity: warning for: 5m - name: Cilium agent endpoint regeneration failures description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale." 
query: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent endpoint update failure description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }})." query: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05' severity: warning for: 5m - name: Cilium agent endpoint create failure description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking." query: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05' severity: info for: 5m # BPF maps - name: Cilium agent map operation failures description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded." query: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05' severity: warning for: 5m - name: Cilium agent BPF map pressure description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full." query: "cilium_bpf_map_pressure{} > 0.9" severity: warning for: 5m comments: Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped. # Conntrack and NAT - name: Cilium agent conntrack table full description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks." query: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0' severity: critical for: 5m - name: Cilium agent conntrack failed garbage collection description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate." 
query: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent NAT table full description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate." query: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0' severity: critical for: 5m # Packet drops - name: Cilium agent high denied rate description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct." query: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0' severity: info for: 10m comments: Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked. - name: Cilium agent high drop rate description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues." query: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05' severity: warning for: 5m # Policy - name: Cilium agent policy map pressure description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply." query: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9' severity: warning for: 5m comments: Map pressure is a per-map ratio, so the fullest policy map of each pod is compared with the threshold. - name: Cilium agent policy import errors description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete." query: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05' severity: warning for: 5m - name: Cilium agent policy implementation delay description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies."
query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60" severity: warning for: 5m comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity. # Identity - name: Cilium node-local high identity allocation description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit." query: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8' severity: warning for: 5m - name: Cilium cluster high identity allocation description: "Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit." query: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8' severity: warning for: 5m # IPAM - name: Cilium operator exhausted IPAM IPs description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking." query: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0' severity: critical for: 5m - name: Cilium operator low available IPAM IPs description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion." query: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0' severity: warning for: 5m comments: Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size. - name: Cilium operator IPAM interface creation failures description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted." query: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05' severity: warning for: 10m comments: | Some Cilium versions may not have a status label on this metric. Verify against your Cilium version. 
# API and K8s client - name: Cilium agent API errors description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy." query: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05' severity: warning for: 5m - name: Cilium agent Kubernetes client errors description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }})." query: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05' severity: info for: 5m # ClusterMesh - name: Cilium ClusterMesh remote cluster not ready description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}." query: "count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium ClusterMesh remote cluster failing description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures)." query: "sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0" severity: critical for: 5m # KVStoreMesh - name: Cilium KVStoreMesh remote cluster not ready description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}." query: "count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium KVStoreMesh remote cluster failing description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures)." 
query: "sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0" severity: critical for: 5m - name: Cilium KVStoreMesh sync errors description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors." query: "sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05" severity: critical for: 5m # Hubble - name: Cilium Hubble lost events description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete." query: "sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05" severity: warning for: 5m - name: Cilium Hubble high DNS error rate description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses." query: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0' severity: warning for: 5m comments: Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload. - name: WireGuard exporters: - name: MindFlavor/prometheus_wireguard_exporter slug: mindflavor-prometheus-wireguard-exporter doc_url: https://github.com/MindFlavor/prometheus_wireguard_exporter rules: - name: WireGuard peer handshake too old description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has not had a handshake for over 5 minutes. The tunnel may be down." query: 'time() - wireguard_latest_handshake_seconds > 300 and wireguard_latest_handshake_seconds > 0' severity: warning for: 2m comments: | The threshold of 300 seconds (5 minutes) is a rough default. WireGuard peers that are idle but reachable typically re-handshake every 2 minutes. Adjust based on your keepalive interval. The `> 0` guard excludes peers that have never completed a handshake (covered by a separate rule). 
- name: WireGuard peer handshake never established description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has never completed a handshake. Check peer configuration and network connectivity." query: 'wireguard_latest_handshake_seconds == 0' severity: critical for: 5m comments: | This alert will fire for all offline mobile/laptop peers. Consider filtering by expected-online peers. - name: WireGuard no traffic on peer description: "WireGuard peer {{ $labels.public_key }} on interface {{ $labels.interface }} has had no traffic for 15 minutes despite an active handshake." query: '(rate(wireguard_sent_bytes_total[15m]) + rate(wireguard_received_bytes_total[15m])) == 0 and wireguard_latest_handshake_seconds > 0 and (time() - wireguard_latest_handshake_seconds) < 300' severity: warning for: 15m comments: | This alert fires when a peer has a recent handshake but zero traffic flow. May indicate routing issues or a misconfigured allowed-ips. Only useful if you expect continuous traffic on all peers. - name: Storage services: - name: Ceph exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/ rules: - name: Ceph State description: Ceph instance unhealthy query: "ceph_health_status != 0" severity: critical for: 1m comments: | ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR. This rule fires on any non-OK state. Split into ==1 (warning) and ==2 (critical) if you want separate severity levels. - name: Ceph monitor clock skew description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings query: "abs(ceph_monitor_clock_skew_seconds) > 0.2" severity: warning for: 2m - name: Ceph monitor low space description: Ceph monitor storage is low. 
query: "ceph_monitor_avail_percent < 10" severity: warning for: 2m - name: Ceph OSD Down description: Ceph Object Storage Daemon Down query: "ceph_osd_up == 0" severity: critical for: 1m - name: Ceph high OSD latency description: "Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state." query: "ceph_osd_apply_latency_ms > 5000" severity: warning for: 1m comments: | Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance. - name: Ceph OSD near full description: A Ceph OSD is dangerously full. Please add more disks. query: 'ceph_health_detail{name="OSD_NEARFULL"} == 1' severity: warning for: 5m comments: | Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%). ceph_health_detail exposes named health checks as individual time series. - name: Ceph OSD reweighted description: Ceph Object Storage Daemon takes too much time to resize. query: "ceph_osd_weight < 1" severity: warning for: 2m - name: Ceph PG down description: Some Ceph placement groups are down. Please ensure that all the data are available. query: "ceph_pg_down > 0" severity: critical - name: Ceph PG incomplete description: Some Ceph placement groups are incomplete. Please ensure that all the data are available. query: "ceph_pg_incomplete > 0" severity: critical - name: Ceph PG inconsistent description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes. query: ceph_pg_inconsistent > 0 severity: warning - name: Ceph PG activation long description: Some Ceph placement groups are taking too long to activate. query: "ceph_pg_activating > 0" severity: warning for: 2m - name: Ceph PG backfill full description: Some Ceph placement groups are located on a full Object Storage Daemon in the cluster. Those PGs may become unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
query: "ceph_pg_backfill_toofull > 0" severity: warning for: 2m - name: Ceph PG unavailable description: Some Ceph placement groups are unavailable. query: "ceph_pg_total - ceph_pg_active > 0" severity: critical for: 1m - name: ZFS exporters: - name: node-exporter slug: node-exporter doc_url: https://github.com/prometheus/node_exporter rules: - name: ZFS offline pool description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}." query: 'node_zfs_zpool_state{state!="online"} > 0' severity: critical for: 1m - name: ZFS exporter slug: zfs_exporter doc_url: https://github.com/pdf/zfs_exporter rules: - name: ZFS pool out of space description: "ZFS pool {{ $labels.pool }} is almost full (< 10% left)." query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and zfs_pool_readonly == 0 and zfs_pool_size_bytes > 0" severity: warning - name: ZFS pool unhealthy description: ZFS pool state is {{ $value }}. See comments for more information. query: "zfs_pool_health > 0" severity: critical comments: | 0: ONLINE 1: DEGRADED 2: FAULTED 3: OFFLINE 4: UNAVAIL 5: REMOVED 6: SUSPENDED - name: ZFS collector failed description: ZFS collector for {{ $labels.instance }} has failed to collect information query: "zfs_scrape_collector_success != 1" severity: warning - name: OpenEBS exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: OpenEBS used pool capacity description: "OpenEBS pool is using more than 80% of its capacity" query: "openebs_used_pool_capacity_percent > 80" severity: warning for: 2m - name: Minio exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Minio cluster disk offline description: "Minio cluster disk is offline" query: "minio_cluster_drive_offline_total > 0" severity: critical - name: Minio node disk offline description: "Minio cluster node is offline" query: "minio_cluster_nodes_offline_total > 0" severity: critical - name: Minio disk space usage description: "Minio available free space is low (< 10%)"
query: minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0 severity: warning - name: Cloud providers services: - name: AWS CloudWatch exporters: - name: prometheus/cloudwatch_exporter slug: prometheus-cloudwatch-exporter doc_url: https://github.com/prometheus/cloudwatch_exporter comments: | CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges. The rules below cover both exporter health and common AWS service alerts. Adjust thresholds and label filters to match your CloudWatch exporter configuration. rules: - name: CloudWatch exporter scrape error description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API." query: "cloudwatch_exporter_scrape_error > 0" severity: warning for: 5m - name: CloudWatch exporter slow scrape description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters." query: "cloudwatch_exporter_scrape_duration_seconds > 300" severity: warning for: 5m - name: CloudWatch API high request rate description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs." query: "sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100" severity: warning comments: | CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests). 100 requests/minute ≈ $45/month. Adjust the threshold based on your budget. - name: AWS EC2 high CPU utilization description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%)." query: "aws_ec2_cpuutilization_average > 90" severity: warning for: 15m comments: Requires EC2 CPUUtilization metric configured in the CloudWatch exporter. 
- name: AWS RDS low free storage space description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining)." query: "aws_rds_free_storage_space_average < 2000000000" severity: warning for: 5m comments: | Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default. Adjust based on your database size. - name: AWS RDS high CPU utilization description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%)." query: "aws_rds_cpuutilization_average > 90" severity: warning for: 15m comments: Requires RDS CPUUtilization metric configured in the CloudWatch exporter. - name: AWS RDS high database connections description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections." query: "aws_rds_database_connections_average > 100" severity: warning for: 5m comments: | The threshold depends on the RDS instance class. Adjust based on your instance type's max_connections parameter. - name: AWS SQS queue messages visible description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed." query: "aws_sqs_approximate_number_of_messages_visible_average > 1000" severity: warning for: 10m comments: | Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000 is a rough default. Adjust based on your expected queue depth. - name: AWS SQS message age too old description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s)." query: "aws_sqs_approximate_age_of_oldest_message_maximum > 3600" severity: warning comments: Requires SQS ApproximateAgeOfOldestMessage metric. - name: AWS ALB unhealthy targets description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}." 
query: "aws_applicationelb_unhealthy_host_count_average > 0" severity: critical for: 5m comments: Requires ApplicationELB UnHealthyHostCount metric. - name: AWS ALB high 5xx error rate description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%)." query: "(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0" severity: critical for: 5m comments: Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics. - name: AWS ALB high target response time description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s)." query: "aws_applicationelb_target_response_time_average > 2" severity: warning for: 5m comments: Requires ApplicationELB TargetResponseTime metric. - name: AWS Lambda high error rate description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%)." query: "(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0" severity: warning for: 5m comments: Requires Lambda Errors and Invocations metrics. - name: Google Cloud Stackdriver exporters: - name: prometheus-community/stackdriver_exporter slug: stackdriver-exporter doc_url: https://github.com/prometheus-community/stackdriver_exporter comments: | Self-monitoring metrics use the stackdriver_monitoring_* prefix. All self-monitoring metrics include a project_id label. rules: - name: Stackdriver exporter scrape error description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}." query: "stackdriver_monitoring_last_scrape_error > 0" severity: warning for: 5m - name: Stackdriver exporter slow scrape description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s)." 
query: "stackdriver_monitoring_last_scrape_duration_seconds > 300" severity: warning for: 5m - name: Stackdriver exporter scrape errors increasing description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}." query: "increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5" severity: warning - name: Stackdriver exporter high API calls description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas." query: "rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100" severity: warning - name: Stackdriver exporter scrape stale description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes." query: "time() - stackdriver_monitoring_last_scrape_timestamp > 600" severity: warning - name: DigitalOcean exporters: - name: metalmatze/digitalocean_exporter slug: digitalocean-exporter doc_url: https://github.com/metalmatze/digitalocean_exporter rules: - name: DigitalOcean droplet down description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running." query: "digitalocean_droplet_up == 0" severity: critical for: 5m - name: DigitalOcean account not active description: "DigitalOcean account is not active. It may be suspended or locked." query: "digitalocean_account_active != 1" severity: critical for: 5m - name: DigitalOcean database down description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline." query: "digitalocean_database_status == 0" severity: critical for: 2m - name: DigitalOcean Kubernetes cluster down description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running." 
query: "digitalocean_kubernetes_cluster_up == 0" severity: critical for: 5m - name: DigitalOcean load balancer down description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active." query: "digitalocean_loadbalancer_status == 0" severity: critical for: 2m - name: DigitalOcean load balancer no backends description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached." query: "digitalocean_loadbalancer_droplets == 0" severity: warning for: 1m - name: DigitalOcean floating IP not assigned description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet." query: "digitalocean_floating_ipv4_active == 0" severity: warning - name: DigitalOcean active incidents description: "DigitalOcean platform has {{ $value }} active incident(s)." query: "digitalocean_incidents_total > 0" severity: warning - name: DigitalOcean exporter collection errors description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors." query: "increase(digitalocean_errors_total[5m]) > 3" severity: warning for: 5m - name: DigitalOcean droplet limit approaching description: "DigitalOcean account is using {{ $value }}% of its droplet quota." query: "(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80 and on (instance, job) digitalocean_account_droplet_limit > 0" severity: warning comments: Fires when more than 80% of the account's droplet limit is in use. Droplets are counted per exporter target (instance, job) and joined on those labels, because a label-less count would never match the labelled limit series. - name: Azure exporters: - name: webdevops/azure-metrics-exporter slug: azure-metrics-exporter doc_url: https://github.com/webdevops/azure-metrics-exporter comments: | The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics. The metric name can be customized via the name parameter in probe configuration. Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes.
rules: - name: Azure exporter request errors description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes." query: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5' severity: warning - name: Azure exporter high error rate description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%)." query: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0' severity: warning for: 5m - name: Azure API read rate limit approaching description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)." query: 'azurerm_api_ratelimit{type="read"} < 100' severity: warning comments: | Azure Resource Manager enforces rate limits per subscription. The threshold of 100 remaining calls is a rough default. Adjust based on your scrape interval and number of monitored resources. - name: Azure API write rate limit approaching description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining)." query: 'azurerm_api_ratelimit{type="write"} < 50' severity: warning - name: Azure exporter slow collection description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s)." query: "azurerm_stats_metric_collecttime > 300" severity: warning for: 5m - name: Observability services: - name: Thanos exporters: - name: Thanos Compactor slug: thanos-compactor rules: - name: Thanos Compactor Multiple Running description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running." 
query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1' severity: warning for: 5m - name: Thanos Compactor Halted description: "Thanos Compact {{$labels.job}} has failed to run and now is halted." query: 'thanos_compact_halted == 1' severity: warning for: 5m - name: Thanos Compactor High Compaction Failures description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions." query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0' severity: warning for: 15m - name: Thanos Compact Bucket High Operation Failures description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations." query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Compact Has Not Run description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours." query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24' severity: warning - name: Thanos Query slug: thanos-query rules: - name: Thanos Query Http Request Query Error Rate High description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.' 
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m])) > 0' severity: critical for: 5m - name: Thanos Query Http Request Query Range Error Rate High description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.' query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0' severity: critical for: 5m - name: Thanos Query Grpc Server Error Rate description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Query Grpc Client Error Rate description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code=~"Unknown|Internal|Unavailable|DataLoss|DeadlineExceeded|ResourceExhausted", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5 and sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m])) > 0' severity: warning for: 5m comments: | Filters to actual error codes only. 
grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled. - name: Thanos Query High D N S Failures description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints." query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Query Instant Latency High description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Range Latency High description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Query Overload description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)" severity: warning for: 15m - name: Thanos Receiver slug: thanos-receiver rules: - name: Thanos Receive Http Request Error Rate High description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5 and sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0' severity: critical for: 5m - name: Thanos Receive Http Request Latency High description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests." query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)' severity: critical for: 10m - name: Thanos Receive High Replication Failures description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests." query: '((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100 and on (job) (thanos_receive_replication_factor > 1)' severity: warning for: 5m comments: The error percentage is the left-hand operand so that $value matches the description. The replication factor guard is joined on job because the aggregated ratio only carries the job label. - name: Thanos Receive High Forward Request Failures description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0' severity: info for: 5m - name: Thanos Receive High Hashring File Refresh Failures description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed." query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0' severity: warning for: 15m - name: Thanos Receive Config Reload Failure description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations." query: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1' severity: warning for: 5m - name: Thanos Receive No Upload description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage." query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)' severity: critical for: 3h - name: Thanos Sidecar slug: thanos-sidecar rules: - name: Thanos Sidecar Bucket Operations Failed description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: Thanos Sidecar No Connection To Started Prometheus description: "Thanos Sidecar {{$labels.instance}} is unhealthy." 
query: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0' severity: critical for: 5m - name: Thanos Store slug: thanos-store rules: - name: Thanos Store Grpc Error Rate description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Store Series Gate Latency High description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests." query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)' severity: warning for: 10m - name: Thanos Store Bucket High Operation Failures description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations." query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) > 0' severity: warning for: 15m - name: Thanos Store Objstore Operation Latency High description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations." 
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)' severity: warning for: 10m - name: Thanos Ruler slug: thanos-ruler rules: - name: Thanos Rule Queue Is Dropping Alerts description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0' severity: critical for: 5m - name: Thanos Rule Sender Is Failing Alerts description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0' severity: critical for: 5m - name: Thanos Rule High Rule Evaluation Failures description: "Thanos Rule {{$labels.instance}} is failing to evaluate {{$value | humanize}}% of rules." query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: critical for: 5m - name: Thanos Rule High Rule Evaluation Warnings description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)." query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: info for: 15m - name: Thanos Rule Rule Evaluation Latency High description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}." 
query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))' severity: warning for: 5m - name: Thanos Rule Grpc Error Rate description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests." query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5) and sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) > 0' severity: warning for: 5m - name: Thanos Rule Config Reload Failure description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration." query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1' severity: info for: 5m - name: Thanos Rule Query High D N S Failures description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints." query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Rule Alertmanager High D N S Failures description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints." 
query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0' severity: warning for: 15m - name: Thanos Rule No Evaluation For10 Intervals description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval." query: 'time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})' severity: info for: 5m comments: Aggregates by rule_group so that each rule group is compared against its own interval. - name: Thanos No Rule Evaluations description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes." query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate slug: thanos-bucket-replicate rules: - name: Thanos Bucket Replicate Error Rate description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed." query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0' severity: critical for: 5m - name: Thanos Bucket Replicate Run Latency description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)' severity: critical for: 5m - name: Thanos Component Absent slug: thanos-component-absent rules: - name: Thanos Compact Is Down description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-compact.*"} == 1)' severity: critical for: 5m - name: Thanos Query Is Down description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-query.*"} == 1)' severity: critical for: 5m - name: Thanos Receive Is Down description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-receive.*"} == 1)' severity: critical for: 5m - name: Thanos Rule Is Down description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-rule.*"} == 1)' severity: critical for: 5m - name: Thanos Sidecar Is Down description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered." query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)' severity: critical for: 5m - name: Thanos Store Is Down description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered." 
query: absent(up{job=~".*thanos-store.*"} == 1) severity: critical for: 5m - name: Loki exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Loki process too many restarts description: A loki process had too many restarts (target {{ $labels.instance }}) query: changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2 severity: warning - name: Loki request errors description: 'The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf "%.2f" $value }}% errors.' query: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0' severity: critical for: 15m - name: Loki request panic description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes." query: sum(increase(loki_panic_total[5m])) by (namespace, job) > 0 severity: critical - name: Loki request latency description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. query: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1 severity: critical for: 5m - name: Promtail exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Promtail request errors description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. 
query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10 and sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 0' severity: critical for: 5m - name: Promtail request latency description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, le)) > 1 severity: critical for: 5m - name: Cortex exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Cortex ruler configuration reload failure description: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) query: cortex_ruler_config_last_reload_successful != 1 severity: warning - name: Cortex not connected to Alertmanager description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) query: cortex_prometheus_notifications_alertmanagers_discovered < 1 severity: critical - name: Cortex notifications are being dropped description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical - name: Cortex notification errors description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s)." query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: critical - name: Cortex ingester unhealthy description: Cortex has an unhealthy ingester query: cortex_ring_members{state="Unhealthy", name="ingester"} > 0 severity: critical - name: Cortex frontend queries stuck description: There are queued up queries in query-frontend. query: sum by (job) (cortex_query_frontend_queue_length) > 0 severity: critical for: 5m - name: Grafana Tempo exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://grafana.com/docs/tempo/latest/operations/monitor/ rules: - name: Tempo distributor unhealthy description: Tempo has {{ $value }} unhealthy distributor(s). query: max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0 severity: warning for: 15m - name: Tempo live store unhealthy description: Tempo has {{ $value }} unhealthy live store(s). query: max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0 severity: critical for: 15m - name: Tempo metrics generator unhealthy description: Tempo has {{ $value }} unhealthy metrics generator(s). query: max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0 severity: critical for: 15m - name: Tempo compactions failing description: "{{ $value }} compactions have failed in the past hour." query: sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0 severity: critical for: 1h comments: | Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing. - name: Tempo polls failing description: "{{ $value }} blocklist polls have failed in the past hour." query: sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0 severity: critical - name: Tempo tenant index failures description: "{{ $value }} tenant index failures in the past hour." 
query: sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0 severity: critical - name: Tempo no tenant index builders description: No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale. query: sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0 severity: critical for: 5m - name: Tempo tenant index too old description: Tenant index for {{ $labels.tenant }} is {{ $value }}s old. query: max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600 severity: critical for: 5m comments: | Threshold of 600s (10 minutes). Adjust based on your tenant index build interval. - name: Tempo block list rising quickly description: Tempo blocklist length is up {{ printf "%.0f" $value }}% over the last 7 days. Consider scaling compactors. query: (avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0 severity: critical for: 15m comments: | Fires when the blocklist grows more than 40% over 7 days. - name: Tempo bad overrides description: '{{ $labels.job }} failed to reload runtime overrides.' query: count by (job) (tempo_runtime_config_last_reload_successful == 0) > 0 severity: critical for: 15m comments: | Uses count() rather than sum(): the filtered series all have value 0, so their sum could never exceed 0. - name: Tempo user configurable overrides reload failing description: "{{ $value }} user-configurable overrides reloads have failed in the past hour." query: sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0 severity: critical - name: Tempo compaction too many outstanding blocks warning description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.
query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 100 severity: warning for: 6h comments: | Threshold of 100 blocks per compactor instance. Adjust based on your environment. - name: Tempo compaction too many outstanding blocks critical description: There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately. query: sum by (instance) (tempodb_compaction_outstanding_blocks) > 250 severity: critical for: 24h comments: | Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment. - name: Tempo distributor usage tracker errors description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})." query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05 severity: critical for: 30m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Tempo metrics generator processor updates failing description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m)." query: sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2 severity: critical for: 15m - name: Tempo metrics generator service graphs dropping spans description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}. query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0' severity: warning for: 15m - name: Tempo metrics generator collections failing description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m)."
query: sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2 severity: critical for: 5m - name: Tempo memcached errors elevated description: 'Tempo memcached error rate is {{ printf "%.2f" $value }}% for {{ $labels.name }} in {{ $labels.job }}.' query: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0' severity: warning for: 10m comments: | Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching. - name: Grafana Mimir exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://grafana.com/docs/mimir/latest/manage/monitor-grafana-mimir/ comments: | Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected. rules: # Core alerts - name: Mimir ingester unhealthy description: Mimir has {{ $value }} unhealthy ingester(s) in the ring. query: min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0 severity: critical for: 15m - name: Mimir request errors description: 'Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.' query: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0' severity: critical for: 15m - name: Mimir inconsistent runtime config description: An inconsistent runtime config file is used across Mimir instances. 
query: count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1 severity: critical for: 1h - name: Mimir bad runtime config description: '{{ $labels.job }} failed to reload runtime config.' query: count by (job) (cortex_runtime_config_last_reload_successful == 0) > 0 severity: critical for: 5m comments: | Uses count() rather than sum(): the filtered series all have value 0, so their sum could never exceed 0. - name: Mimir scheduler queries stuck description: There are {{ $value }} queued up queries in {{ $labels.job }}. query: sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0 severity: critical for: 7m - name: Mimir cache request errors description: 'Mimir cache {{ $labels.name }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.' query: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0' severity: warning for: 5m - name: Mimir KV store failure description: 'Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.' query: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0' severity: critical for: 5m - name: Mimir memory map areas too high description: 'Mimir {{ $labels.job }} is using {{ printf "%.0f" $value }}% of its memory map area limit.' query: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0' severity: critical for: 5m - name: Mimir ingester instance has no tenants description: Mimir ingester {{ $labels.instance }} has no tenants assigned.
query: (cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 1h > 0) severity: warning for: 1h - name: Mimir ruler instance has no rule groups description: Mimir ruler {{ $labels.instance }} has no rule groups assigned. query: (cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 1h > 0) severity: warning for: 1h - name: Mimir ingested data too far in the future description: Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future. query: max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600 severity: warning for: 5m - name: Mimir store gateway too many failed operations description: Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s). query: sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: Mimir ring members mismatch description: Mimir {{ $labels.name }} ring has inconsistent member counts across instances. query: max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members)) severity: warning for: 15m # Instance limits - name: Mimir ingester reaching series limit warning description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.' query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)' severity: warning for: 3h - name: Mimir ingester reaching series limit critical description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its series limit.'
query: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)' severity: critical for: 5m - name: Mimir ingester reaching tenants limit warning description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.' query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)' severity: warning for: 5m - name: Mimir ingester reaching tenants limit critical description: 'Mimir ingester {{ $labels.instance }} has reached {{ printf "%.0f" $value }}% of its tenants limit.' query: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)' severity: critical for: 5m - name: Mimir reaching TCP connections limit description: 'Mimir instance {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its TCP connections limit.' query: cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0 severity: critical for: 5m - name: Mimir distributor inflight requests high description: 'Mimir distributor {{ $labels.instance }} is using {{ printf "%.0f" $value }}% of its inflight push requests limit.' query: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and ignoring(limit) (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0)' severity: critical for: 5m # Blocks and TSDB - name: Mimir ingester TSDB head compaction failed description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s)."
query: rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05 severity: critical for: 15m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB head truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05 severity: critical for: 15m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB checkpoint creation failed description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05 severity: critical for: 15m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB checkpoint deletion failed description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05 severity: critical comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB WAL truncation failed description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05 severity: warning comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir ingester TSDB WAL writes failed description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s)." query: rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05 severity: critical for: 3m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
- name: Mimir store gateway has not synced bucket description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes. query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0 comments: | Threshold of 30 minutes. Adjust based on your sync interval. severity: critical for: 5m - name: Mimir store gateway no synced tenants description: Mimir store-gateway {{ $labels.instance }} has no synced tenants. query: (min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0) severity: warning for: 1h - name: Mimir bucket index not updated description: 'Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.' query: min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100 severity: critical # Compactor - name: Mimir compactor not cleaning up blocks description: Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours. query: (time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0 severity: critical for: 1h - name: Mimir compactor not running compaction description: Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours. query: (time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0 severity: critical for: 15m - name: Mimir compactor has consecutive failures description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours." 
query: increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1 severity: critical - name: Mimir compactor has run out of disk space description: Mimir compactor {{ $labels.instance }} has run out of disk space. query: delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1 comments: | cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase(). severity: critical - name: Mimir compactor has not uploaded blocks description: Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours. query: (time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0 severity: critical for: 15m - name: Mimir compactor skipped blocks description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})." query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0 comments: | Using a 24h window as compaction skips are rare events. severity: warning for: 5m # Ruler - name: Mimir ruler too many failed pushes description: 'Mimir ruler {{ $labels.instance }} is failing to push {{ printf "%.2f" $value }}% of write requests.' query: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0' severity: critical for: 5m - name: Mimir ruler too many failed queries description: 'Mimir ruler {{ $labels.instance }} is failing {{ printf "%.2f" $value }}% of query evaluations.' 
query: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0' severity: critical for: 5m - name: Mimir ruler missed evaluations description: 'Mimir ruler {{ $labels.instance }} is missing {{ printf "%.2f" $value }}% of rule group evaluations.' query: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0' severity: warning for: 5m - name: Mimir ruler failed ring check description: Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s). query: sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05 comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m # Alertmanager - name: Mimir alertmanager sync configs failing description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05 severity: critical for: 30m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager ring check failing description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05 severity: critical for: 10m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager state merge failing description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s)." 
query: rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05 severity: critical for: 10m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager replication failing description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05 severity: critical for: 10m comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager persist state failing description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s)." query: rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05 severity: critical for: 1h comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. - name: Mimir alertmanager initial sync failed description: Mimir alertmanager {{ $labels.job }} failed initial state sync. query: increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 severity: warning - name: Mimir alertmanager instance has no tenants description: Mimir alertmanager {{ $labels.instance }} has no tenants assigned. query: (cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 1h > 0) severity: warning for: 1h # Gossip - name: Mimir gossip members count too high description: Mimir gossip cluster has more members than expected. query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' severity: warning for: 20m - name: Mimir gossip members count too low description: Mimir gossip cluster has fewer members than expected. 
query: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)' severity: warning for: 20m # Go runtime - name: Mimir go threads too high warning description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.' query: 'go_threads{job=~".*(mimir|cortex).*"} > 5000' severity: warning for: 15m comments: | A high number of Go threads may indicate a goroutine leak. - name: Mimir go threads too high critical description: 'Mimir {{ $labels.instance }} has {{ $value }} Go threads.' query: 'go_threads{job=~".*(mimir|cortex).*"} > 8000' severity: critical for: 15m - name: Grafana Alloy exporters: - name: Embedded exporter slug: embedded-exporter rules: - name: Grafana Alloy service down description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running." query: "count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)" severity: critical - name: OpenTelemetry Collector exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/ comments: | OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint. These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly. All collector internal metrics are prefixed with 'otelcol_'. rules: - name: OpenTelemetry Collector down description: OpenTelemetry Collector instance has disappeared or is not being scraped query: 'up{job=~".*otel.*collector.*"} == 0' severity: critical for: 1m comments: | Adjust the job label regex to match the actual job name in your Prometheus scrape config. - name: OpenTelemetry Collector receiver refused spans description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}." 
query: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector receiver refused metric points description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}." query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector receiver refused log records description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}." query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: critical for: 5m - name: OpenTelemetry Collector exporter failed spans description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}." query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector exporter failed metric points description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}." query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. severity: warning for: 5m - name: OpenTelemetry Collector exporter failed log records description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}." query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. 
severity: warning for: 5m - name: OpenTelemetry Collector exporter queue nearly full description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full" query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) (otelcol_exporter_queue_capacity > 0)' severity: warning comments: | The capacity guard matches on (instance, job, exporter) because the ratio only keeps those labels. - name: OpenTelemetry Collector processor refused spans description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure." query: 'rate(otelcol_processor_refused_spans[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. These processor metrics are deprecated since collector v0.110.0. severity: warning for: 5m - name: OpenTelemetry Collector processor refused metric points description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure." query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05' comments: | Threshold of 0.05/s avoids firing on transient single-event spikes. These processor metrics are deprecated since collector v0.110.0.
severity: warning for: 5m - name: OpenTelemetry Collector high memory usage description: "OpenTelemetry Collector memory usage is above 90%" query: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9' severity: warning for: 5m - name: OpenTelemetry Collector OTLP receiver errors description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused" query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0' severity: critical for: 2m - name: Jaeger exporters: - name: Embedded exporter (v2+) slug: embedded-exporter doc_url: https://www.jaegertracing.io/docs/2.dev/operations/monitoring/ comments: | Jaeger v2 is built on OpenTelemetry Collector and exposes metrics on port 8888 (/metrics). It emits standard otelcol_* pipeline metrics alongside Jaeger-specific storage and query metrics. For span ingestion pipeline alerts (refused spans, export failures, queue saturation), use the OpenTelemetry Collector rules instead. rules: - name: Jaeger high storage error rate description: "Jaeger on {{ $labels.instance }} is experiencing {{ $value | humanize }}% storage errors on {{ $labels.operation }}." query: '100 * sum(rate(jaeger_storage_requests_total{result="err"}[1m])) by (instance, job, namespace, operation) / sum(rate(jaeger_storage_requests_total[1m])) by (instance, job, namespace, operation) > 1 and sum(rate(jaeger_storage_requests_total[1m])) by (instance, job, namespace, operation) > 0' severity: warning for: 5m - name: Jaeger slow storage operations description: "Jaeger on {{ $labels.instance }} storage p99 latency for {{ $labels.operation }} is {{ $value | humanizeDuration }}." query: 'histogram_quantile(0.99, sum(rate(jaeger_storage_latency_seconds_bucket[5m])) by (le, instance, job, namespace, operation)) > 1' severity: warning for: 5m comments: | Threshold of 1s is a rough default. 
Adjust based on your storage backend and data volume. - name: Jaeger query service high error rate description: "Jaeger query service on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors." query: '100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces",http_response_status_code=~"5.."}[1m])) by (instance, job, namespace) / sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) by (instance, job, namespace) > 1 and sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) by (instance, job, namespace) > 0' severity: warning for: 5m comments: | Filters on http_route="/api/traces" (the trace search endpoint). The http_server_request_duration_seconds metric is emitted by the otelhttp middleware used by the Jaeger query service. - name: Jaeger query service slow responses description: "Jaeger query service on {{ $labels.instance }} p99 response latency is {{ $value | humanizeDuration }}." query: 'histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces"}[5m])) by (le, instance, job, namespace)) > 2' severity: warning for: 5m comments: | Threshold of 2s is a rough default. Adjust based on your storage backend and data volume. - name: Jaeger storage completely unavailable description: "Jaeger on {{ $labels.instance }} has 100% storage errors for {{ $labels.operation }} — storage backend may be down." query: 'sum(rate(jaeger_storage_requests_total{result="err"}[1m])) by (instance, job, namespace, operation) > 0 and sum(rate(jaeger_storage_requests_total{result="ok"}[1m])) by (instance, job, namespace, operation) == 0' severity: critical for: 2m comments: | Fires when all storage operations for a given type are failing and none are succeeding. Indicates the storage backend (Cassandra, Elasticsearch, etc.) is likely unreachable or misconfigured. 
- name: Jaeger slow single trace retrieval description: "Jaeger on {{ $labels.instance }} p99 latency for single trace retrieval is {{ $value | humanizeDuration }}." query: 'histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces/{traceID}"}[5m])) by (le, instance, job, namespace)) > 5' severity: warning for: 5m comments: | Single trace retrieval (/api/traces/{traceID}) can be slower than search, especially for large traces. Threshold of 5s is a rough default. - name: Jaeger service discovery errors description: "Jaeger on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors on the services endpoint." query: '100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/services",http_response_status_code=~"5.."}[1m])) by (instance, job, namespace) / sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) by (instance, job, namespace) > 1 and sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) by (instance, job, namespace) > 0' severity: warning for: 5m comments: | Errors on /api/services indicate the storage backend cannot return the list of instrumented services, which breaks the Jaeger UI service selector. - name: Jaeger no storage reads succeeding description: "Jaeger on {{ $labels.instance }} has no successful storage reads for {{ $labels.operation }} in the past 15 minutes." query: 'sum(increase(jaeger_storage_requests_total{result="ok"}[15m])) by (instance, job, namespace, operation) == 0 and sum(increase(jaeger_storage_requests_total[15m])) by (instance, job, namespace, operation) > 0' severity: warning for: 5m comments: | Fires when an operation (e.g. find_traces, get_services) has received requests but none succeeded. May indicate a persistent storage error or a backend that is slow to recover. 
# NOTE(review): the first entry below is truncated in this copy. Text appears to be missing between
# "(legacy," and "1 and sum(" — presumably the rest of the exporter header and the beginning of its
# first rule. The fragment is kept verbatim on one line; restore it from the upstream file before use.
          - name: Embedded exporter (legacy, 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0' severity: warning for: 15m
              # The Jaeger rules below share one shape: 100 * failed / total per (instance, job, namespace),
              # alerting above 1 (percent). The trailing `and ... > 0` clause requires a non-zero total rate.
              - name: Jaeger client RPC request errors
                description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors."
                query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              # Numerator selects spans whose `result` label is "dropped" or "err".
              - name: Jaeger client spans dropped
                description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
                query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              # Ratio of failed batches to submitted batches (two separate metrics, not one metric split by label).
              - name: Jaeger agent spans dropped
                description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches."
                query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              # Ratio of dropped spans to received spans (two separate metrics).
              - name: Jaeger collector dropping spans
                description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans."
                query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              # The next three rules count entries labelled result="err" against all entries of the same metric.
              - name: Jaeger sampling update failing
                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates."
                query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger throttling update failing
                description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates."
                query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m
              - name: Jaeger query request failures
                description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests."
                query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0'
                severity: warning
                for: 15m

  # New top-level group: services that do not belong to the preceding groups.
  - name: Other
    services:
      - name: APC UPS
        exporters:
          - name: mdlayher/apcupsd_exporter
            slug: apcupsd_exporter
            doc_url: https://github.com/mdlayher/apcupsd_exporter
            rules:
              - name: APC UPS Battery nearly empty
                description: Battery is almost empty (< 10% left)
                query: "apcupsd_battery_charge_percent < 10"
                severity: critical
              # 900 seconds = the 15 minutes named in the rule title.
              - name: APC UPS Less than 15 Minutes of battery time remaining
                description: Battery is almost empty (< 15 Minutes remaining)
                query: "apcupsd_battery_time_left_seconds < 900"
                severity: critical
              # Any positive time-on-battery value fires; the description renders that value as a duration.
              - name: APC UPS AC input outage
                description: UPS now running on battery (since {{$value | humanizeDuration}})
                query: "apcupsd_battery_time_on_seconds > 0"
                severity: warning
              # Measured / nominal voltage below 0.95; the `and ... > 0` clause requires a positive nominal value.
              - name: APC UPS low battery voltage
                description: Battery voltage is lower than nominal (< 95%)
                query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0"
                severity: warning
              - name: APC UPS high temperature
                description: Internal temperature is high ({{$value}}°C)
                query: "apcupsd_internal_temperature_celsius >= 40"
                severity: warning
                for: 2m
              - name: APC UPS high load
                description: UPS load is > 80%
                query: "apcupsd_ups_load_percent > 80"
                severity: warning
      - name: Graph Node
        exporters:
          - name: Embedded exporter
            slug: embedded-exporter
            rules:
              # The four rules below each match one value (1-4) of eth_rpc_status. The meaning attached to
              # each value is taken from the rule names here — NOTE(review): confirm against Graph Node docs.
              - name: Provider failed because net_version failed
                description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 1"
                severity: critical
              - name: Provider failed because get genesis failed
                description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 2"
                severity: critical
              - name: Provider failed because net_version timeout
                description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 3"
                severity: critical
              - name: Provider failed because get genesis timeout
                description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
                query: "eth_rpc_status == 4"
                severity: critical
              # The two rules below read the same metric with thresholds 10 and 20;
              # a value above 20 satisfies both, so the warning and the critical rule match together.
              - name: Store connection slow
                description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
                query: "store_connection_wait_time_ms > 10"
                severity: warning
                comments: |
                  Threshold of 10ms. Adjust based on your expected database latency.
              - name: Store connection very slow
                description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
                query: "store_connection_wait_time_ms > 20"
                severity: critical
                comments: |
                  Threshold of 20ms. Adjust based on your expected database latency.
      - name: LiteLLM
        exporters:
          # This exporter entry has no `name:` key, only a slug and a documentation link.
          - slug: embedded-exporter
            doc_url: https://docs.litellm.ai/docs/proxy/prometheus
            rules:
              # Sums the 24h increase of the spend metric over every series whose `model` label matches
              # the regex, and compares the total against 1 (stated to be USD in the rule's own comments).
              - name: LiteLLM provider spend over budget
                description: "Cumulative spend for an LLM provider has exceeded the daily budget threshold. Replace the regex `(claude-|anthropic/).*` with your provider's model-name pattern. Useful as a soft-warning when `provider_budget_config` hard-cap is unavailable or disabled."
                query: 'sum(increase(litellm_spend_metric_total{model=~"(claude-|anthropic/).*"}[24h])) > 1'
                severity: warning
                for: 5m
                comments: |
                  The threshold (1) is in USD. The `model` label carries the resolved model-name (post-routing). PromQL `increase()` requires ≥2 datapoints with growth-difference to extrapolate positive — for brand-new counter series this needs ≥2 distinct request bursts ≥1 scrape-cycle apart.
# LiteLLM proxy health rules. These items continue a `rules:` list whose key sits above this span;
# indentation follows the groups > services > exporters > rules nesting used at the top of the file.
              # Failed-request rate divided by total-request rate over 5m, each summed across all series
              # (no grouping labels, so one value overall), compared against 0.05 — the 5% in the description.
              - name: LiteLLM proxy failed requests rate high
                description: "LiteLLM proxy is returning failed responses to clients (>5% error rate over 5min). Investigate downstream LLM provider availability or auth issues."
                query: "sum(rate(litellm_proxy_failed_requests_metric_total[5m])) / sum(rate(litellm_proxy_total_requests_metric_total[5m])) > 0.05"
                severity: warning
                for: 10m
              # p95 estimated from the latency histogram buckets, aggregated by `le` only (one value overall),
              # compared against 10 — seconds, per the description.
              - name: LiteLLM request latency p95 high
                description: "LiteLLM request total latency p95 exceeds 10 seconds over 5min. Check downstream LLM provider response-times and proxy queue-depth."
                query: "histogram_quantile(0.95, sum(rate(litellm_request_total_latency_metric_bucket[5m])) by (le)) > 10"
                severity: warning
                for: 10m