mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
3369 lines
202 KiB
YAML
3369 lines
202 KiB
YAML
#
|
|
# The following yaml cannot be copy-pasted to Prometheus configuration.
|
|
# Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
|
|
#
|
|
# Contributing guidelines:
|
|
# https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md
|
|
#
|
|
|
|
groups:
|
|
- name: Basic resource monitoring
|
|
services:
|
|
- name: Prometheus self-monitoring
|
|
exporters:
|
|
- slug: embedded-exporter
|
|
rules:
|
|
- name: Prometheus job missing
|
|
description: A Prometheus job has disappeared
|
|
query: 'absent(up{job="prometheus"})'
|
|
severity: warning
|
|
- name: Prometheus target missing
|
|
description: A Prometheus target has disappeared. An exporter might be crashed.
|
|
query: "up == 0"
|
|
severity: critical
|
|
- name: Prometheus all targets missing
|
|
description: A Prometheus job does not have any living target anymore.
|
|
query: "sum by (job) (up) == 0"
|
|
severity: critical
|
|
- name: Prometheus target missing with warmup time
|
|
description: "Allow a job time to start up (10 minutes) before alerting that it's down."
|
|
query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
|
|
severity: critical
|
|
- name: Prometheus configuration reload failure
|
|
description: Prometheus configuration reload error
|
|
query: "prometheus_config_last_reload_successful != 1"
|
|
severity: warning
|
|
- name: Prometheus too many restarts
|
|
description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
|
|
query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
|
|
severity: warning
|
|
- name: Prometheus AlertManager job missing
|
|
description: A Prometheus AlertManager job has disappeared
|
|
query: 'absent(up{job="alertmanager"})'
|
|
severity: warning
|
|
- name: Prometheus AlertManager configuration reload failure
|
|
description: AlertManager configuration reload error
|
|
query: "alertmanager_config_last_reload_successful != 1"
|
|
severity: warning
|
|
- name: Prometheus AlertManager config not synced
|
|
description: Configurations of AlertManager cluster instances are out of sync
|
|
query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
|
|
severity: warning
|
|
- name: Prometheus AlertManager E2E dead man switch
|
|
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager."
|
|
query: "vector(1)"
|
|
severity: critical
|
|
- name: Prometheus not connected to alertmanager
|
|
description: Prometheus cannot connect to the alertmanager
|
|
query: "prometheus_notifications_alertmanagers_discovered < 1"
|
|
severity: critical
|
|
- name: Prometheus rule evaluation failures
|
|
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
|
|
query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus template text expansion failures
|
|
description: "Prometheus encountered {{ $value }} template text expansion failures"
|
|
query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus rule evaluation slow
|
|
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query."
|
|
query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Prometheus notifications backlog
|
|
description: The Prometheus notification queue has not been empty for 10 minutes
|
|
query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
|
|
severity: warning
|
|
- name: Prometheus AlertManager notification failing
|
|
description: Alertmanager is failing to send notifications
|
|
query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus target empty
|
|
description: Prometheus has no target in service discovery
|
|
query: "prometheus_sd_discovered_targets == 0"
|
|
severity: critical
|
|
- name: Prometheus target scraping slow
|
|
description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
|
|
query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Prometheus large scrape
|
|
description: Prometheus has many scrapes that exceed the sample limit
|
|
query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Prometheus target scrape duplicate
|
|
description: Prometheus has many samples rejected due to duplicate timestamps but different values
|
|
query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
|
|
severity: warning
|
|
- name: Prometheus TSDB checkpoint creation failures
|
|
description: "Prometheus encountered {{ $value }} checkpoint creation failures"
|
|
query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB checkpoint deletion failures
|
|
description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
|
|
query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB compactions failed
|
|
description: "Prometheus encountered {{ $value }} TSDB compactions failures"
|
|
query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB head truncations failed
|
|
description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
|
|
query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB reload failures
|
|
description: "Prometheus encountered {{ $value }} TSDB reload failures"
|
|
query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB WAL corruptions
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
|
|
query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus TSDB WAL truncations failed
|
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
|
|
query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
|
|
severity: critical
|
|
- name: Prometheus timeseries cardinality
|
|
description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
|
|
query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
|
|
severity: warning
|
|
|
|
- name: Host and hardware
|
|
exporters:
|
|
- name: node-exporter
|
|
slug: node-exporter
|
|
doc_url: https://github.com/prometheus/node_exporter
|
|
rules:
|
|
- name: Host out of memory
|
|
description: Node memory is filling up (< 10% left)
|
|
query: "(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host memory under memory pressure
|
|
description: The node is under heavy memory pressure. High rate of loading memory pages from disk.
|
|
query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
|
|
severity: warning
|
|
- name: Host Memory is underutilized
|
|
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
|
|
query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
|
|
severity: info
|
|
comments: |
|
|
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
|
- name: Host unusual network throughput in
|
|
description: Host receive bandwidth is high (>80%).
|
|
query: "((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80)"
|
|
severity: warning
|
|
- name: Host unusual network throughput out
|
|
description: Host transmit bandwidth is high (>80%)
|
|
query: "((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80)"
|
|
severity: warning
|
|
- name: Host unusual disk read rate
|
|
description: Disk is too busy (IO wait > 80%)
|
|
query: "(rate(node_disk_io_time_seconds_total[5m]) > .80)"
|
|
severity: warning
|
|
- name: Host out of disk space
|
|
description: Disk is almost full (< 10% left)
|
|
query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
|
severity: critical
|
|
comments: |
|
|
Please add ignored mountpoints in node_exporter parameters like
|
|
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
|
Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
|
for: 2m
|
|
- name: Host disk may fill in 24 hours
|
|
description: Filesystem will likely run out of space within the next 24 hours.
|
|
query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
|
|
severity: warning
|
|
comments: |
|
|
Please add ignored mountpoints in node_exporter parameters like
|
|
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
|
Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
|
for: 2m
|
|
- name: Host out of inodes
|
|
description: Disk is almost running out of available inodes (< 10% left)
|
|
query: "(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)"
|
|
severity: critical
|
|
for: 2m
|
|
- name: Host filesystem device error
|
|
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
|
|
query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Host inodes may fill in 24 hours
|
|
description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
|
|
query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host unusual disk read latency
|
|
description: Disk latency is growing (read operations > 100ms)
|
|
query: "(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host unusual disk write latency
|
|
description: Disk latency is growing (write operations > 100ms)
|
|
query: "(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host high CPU load
|
|
description: CPU load is > 80%
|
|
query: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
|
severity: warning
|
|
for: 10m
|
|
- name: Host CPU is underutilized
|
|
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
|
|
query: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
|
severity: info
|
|
for: 1w
|
|
comments: |
|
|
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
|
- name: Host CPU steal noisy neighbor
|
|
description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.
|
|
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
|
severity: warning
|
|
- name: Host CPU high iowait
|
|
description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
|
|
query: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
|
severity: warning
|
|
- name: Host unusual disk IO
|
|
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities."
|
|
query: "rate(node_disk_io_time_seconds_total[5m]) > 0.8"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Host context switching high
|
|
description: Context switching is growing on the node (twice the daily average during the last 15m)
|
|
query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
|
severity: warning
|
|
comments: |
|
|
x2 context switches is an arbitrary number.
|
|
The alert threshold depends on the nature of the application.
|
|
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
|
- name: Host swap is filling up
|
|
description: Swap is filling up (>80%)
|
|
query: "((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host systemd service crashed
|
|
description: "systemd service crashed"
|
|
query: '(node_systemd_unit_state{state="failed"} == 1)'
|
|
severity: warning
|
|
- name: Host physical component too hot
|
|
description: "Physical hardware component too hot"
|
|
query: "node_hwmon_temp_celsius > node_hwmon_temp_max_celsius"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Host node overtemperature alarm
|
|
description: "Physical node temperature alarm triggered"
|
|
query: "((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))"
|
|
severity: critical
|
|
- name: Host software RAID insufficient drives
|
|
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
|
|
query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
|
|
severity: critical
|
|
- name: Host software RAID disk failure
|
|
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
|
|
query: '(node_md_disks{state="failed"} > 0)'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host kernel version deviations
|
|
description: Kernel version for {{ $labels.instance }} has changed.
|
|
query: "changes(node_uname_info[1h]) > 0"
|
|
severity: info
|
|
- name: Host OOM kill detected
|
|
description: OOM kill detected
|
|
query: "(increase(node_vmstat_oom_kill[1m]) > 0)"
|
|
severity: warning
|
|
# Reports new correctable memory errors counted by EDAC.
- name: Host EDAC Correctable Errors detected
  description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
  # The range matches the 5 minutes quoted in the description, so $value is
  # the number of errors the message claims.
  query: "(increase(node_edac_correctable_errors_total[5m]) > 0)"
  severity: info
|
|
# Reports new uncorrectable memory errors counted by EDAC.
- name: Host EDAC Uncorrectable Errors detected
  description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
  # Use increase() over 5m rather than the raw counter: the raw counter stays
  # above zero after the first error, so the alert would never resolve and
  # $value would be the total so far, not the 5-minute count the description states.
  query: "(increase(node_edac_uncorrectable_errors_total[5m]) > 0)"
  severity: warning
|
|
# Alerts when more than 1% of received packets are errors.
- name: Host Network Receive Errors
  # $value is the errors/packets ratio produced by the query, not an error
  # count, so it is printed as a ratio with two decimals.
  description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} over the last two minutes.'
  query: "(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)"
  severity: warning
  for: 2m
|
|
# Alerts when more than 1% of transmitted packets are errors.
- name: Host Network Transmit Errors
  # $value is the errors/packets ratio produced by the query, not an error
  # count, so it is printed as a ratio with two decimals.
  description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} over the last two minutes.'
  query: "(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)"
  severity: warning
  for: 2m
|
|
- name: Host Network Bond Degraded
|
|
description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
|
|
query: "((node_bonding_active - node_bonding_slaves) != 0)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Host conntrack limit
|
|
description: "The number of conntrack is approaching limit"
|
|
query: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Host clock skew
|
|
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
|
|
query: "((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Host clock not synchronising
|
|
description: "Clock not synchronising. Ensure NTP is configured on this host."
|
|
query: "(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: S.M.A.R.T Device Monitoring
|
|
exporters:
|
|
- name: smartctl-exporter
|
|
slug: smartctl-exporter
|
|
doc_url: https://github.com/prometheus-community/smartctl_exporter
|
|
rules:
|
|
- name: SMART device temperature warning
|
|
description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
|
|
query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
|
|
severity: warning
|
|
- name: SMART device temperature critical
|
|
description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
|
|
query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
|
|
severity: critical
|
|
- name: SMART device temperature over trip value
|
|
description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
|
|
severity: critical
|
|
- name: SMART device temperature nearing trip value
|
|
description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
|
|
severity: warning
|
|
- name: SMART status
|
|
description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: "smartctl_device_smart_status != 1"
|
|
severity: critical
|
|
- name: SMART critical warning
|
|
description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: "smartctl_device_critical_warning > 0"
|
|
severity: critical
|
|
- name: SMART media errors
|
|
description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: "smartctl_device_media_errors > 0"
|
|
severity: critical
|
|
- name: SMART Wearout Indicator
|
|
description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
|
|
query: "smartctl_device_available_spare < smartctl_device_available_spare_threshold"
|
|
severity: critical
|
|
|
|
- name: Docker containers
|
|
exporters:
|
|
- name: google/cAdvisor
|
|
slug: google-cadvisor
|
|
doc_url: https://github.com/google/cadvisor
|
|
rules:
|
|
- name: Container killed
|
|
description: A container has disappeared
|
|
query: "time() - container_last_seen > 60"
|
|
severity: warning
|
|
comments: |
|
|
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
|
- name: Container absent
|
|
description: A container is absent for 5 min
|
|
query: "absent(container_last_seen)"
|
|
severity: warning
|
|
for: 5m
|
|
comments: |
|
|
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
|
- name: Container High CPU utilization
|
|
description: Container CPU utilization is above 80%
|
|
query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Container High Memory usage
|
|
description: Container Memory usage is above 80%
|
|
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
|
|
severity: warning
|
|
comments: See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
|
for: 2m
|
|
# Alerts when more than 80% of the inodes of named containers are in use,
# aggregated per instance.
- name: Container Volume usage
  description: Container volume inode usage is above 80%
  # Numerator and denominator carry the same name!="" filter; otherwise the
  # total would also count series without a container name and overstate usage.
  query: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80'
  severity: warning
  for: 2m
|
|
- name: Container high throttle rate
|
|
description: Container is being throttled
|
|
query: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Container high low change CPU usage
|
|
description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.
|
|
query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
|
|
severity: info
|
|
- name: Container Low CPU utilization
|
|
description: Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
|
|
query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
|
|
severity: info
|
|
for: 7d
|
|
- name: Container Low Memory usage
|
|
description: Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.
|
|
query: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
|
|
severity: info
|
|
for: 7d
|
|
|
|
- name: Blackbox
|
|
exporters:
|
|
- name: prometheus/blackbox_exporter
|
|
slug: blackbox-exporter
|
|
doc_url: https://github.com/prometheus/blackbox_exporter
|
|
rules:
|
|
- name: Blackbox probe failed
|
|
description: Probe failed
|
|
query: probe_success == 0
|
|
severity: critical
|
|
- name: Blackbox configuration reload failure
|
|
description: Blackbox configuration reload failure
|
|
query: "blackbox_exporter_config_last_reload_successful != 1"
|
|
severity: warning
|
|
- name: Blackbox slow probe
|
|
description: Blackbox probe took more than 1s to complete
|
|
query: "avg_over_time(probe_duration_seconds[1m]) > 1"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Blackbox probe HTTP failure
|
|
description: HTTP status code is not 200-399
|
|
query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
|
|
severity: critical
|
|
- name: Blackbox SSL certificate will expire soon
|
|
description: SSL certificate expires in less than 20 days
|
|
query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
|
|
severity: warning
|
|
- name: Blackbox SSL certificate will expire soon
|
|
description: SSL certificate expires in less than 3 days
|
|
query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
|
|
severity: critical
|
|
- name: Blackbox SSL certificate expired
|
|
description: SSL certificate has expired already
|
|
query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
|
|
severity: critical
|
|
comments: |
|
|
For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
|
|
need to enable insecure_skip_verify. Note that this will disable
|
|
certificate validation.
|
|
See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
|
|
- name: Blackbox probe slow HTTP
|
|
description: HTTP request took more than 1s
|
|
query: "avg_over_time(probe_http_duration_seconds[1m]) > 1"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Blackbox probe slow ping
|
|
description: Blackbox ping took more than 1s
|
|
query: "avg_over_time(probe_icmp_duration_seconds[1m]) > 1"
|
|
severity: warning
|
|
for: 1m
|
|
|
|
- name: Windows Server
|
|
exporters:
|
|
- name: prometheus-community/windows_exporter
|
|
slug: windows-exporter
|
|
doc_url: https://github.com/prometheus-community/windows_exporter
|
|
rules:
|
|
- name: Windows Server collector Error
|
|
description: "Collector {{ $labels.collector }} was not successful"
|
|
query: "windows_exporter_collector_success == 0"
|
|
severity: critical
|
|
- name: Windows Server service Status
|
|
description: Windows Service state is not OK
|
|
query: 'windows_service_status{status="ok"} != 1'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Windows Server CPU Usage
|
|
description: CPU Usage is more than 80%
|
|
query: '100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80'
|
|
severity: warning
|
|
- name: Windows Server memory Usage
|
|
description: Memory usage is more than 90%
|
|
query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Windows Server disk Space Usage
|
|
description: Disk usage is more than 80%
|
|
query: "100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80"
|
|
severity: critical
|
|
for: 2m
|
|
|
|
- name: VMware
|
|
exporters:
|
|
- name: pryorda/vmware_exporter
|
|
slug: pryorda-vmware-exporter
|
|
doc_url: https://github.com/pryorda/vmware_exporter
|
|
rules:
|
|
- name: Virtual Machine Memory Warning
|
|
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Virtual Machine Memory Critical
|
|
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "vmware_vm_mem_usage_average / 100 >= 90"
|
|
severity: critical
|
|
for: 1m
|
|
- name: High Number of Snapshots
|
|
description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
|
|
query: "vmware_vm_snapshots > 3"
|
|
severity: warning
|
|
for: 30m
|
|
- name: Outdated Snapshots
|
|
description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
|
|
query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
|
|
severity: warning
|
|
for: 5m
|
|
|
|
- name: Netdata
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://github.com/netdata/netdata/blob/master/backends/prometheus/README.md
|
|
rules:
|
|
# Alerts when CPU usage, taken as 100 minus the idle percentage, stays above 80%.
- name: Netdata high cpu usage
  description: Netdata high CPU usage (> 80%)
  # The metric is already a percentage gauge, so it is averaged over the window
  # instead of passed to rate(); usage is derived from the idle dimension so
  # the comparison matches the "> 80%" in the description.
  query: '100 - avg_over_time(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m]) > 80'
  severity: warning
  for: 5m
|
|
# Alerts when the CPU steal percentage reported by Netdata stays above 10%.
- name: Host CPU steal noisy neighbor
  description: CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.
  # The metric is a percentage gauge: average it over the window rather than
  # taking rate(), which would return its per-second change, not its level.
  query: 'avg_over_time(netdata_cpu_cpu_percentage_average{dimension="steal"}[1m]) > 10'
  severity: warning
  for: 5m
|
|
# Alerts when free + cached RAM falls below 20% of the total across all dimensions.
- name: Netdata high memory usage
  description: Netdata high memory usage (> 80%)
  # Both sides are summed without the "dimension" label so they can be matched
  # one-to-one. Dividing the raw series by itself matched every dimension with
  # itself and always yielded 100, so the alert could never fire.
  query: '100 * sum without (dimension) (netdata_system_ram_MiB_average{dimension=~"free|cached"}) / sum without (dimension) (netdata_system_ram_MiB_average) < 20'
  severity: warning
  for: 5m
|
|
# Alerts when available disk space falls below 20% of the total across all dimensions.
- name: Netdata low disk space
  description: Netdata low disk space (> 80%)
  # Both sides are summed without the "dimension" label so they can be matched
  # one-to-one. Dividing the raw series by itself matched every dimension with
  # itself and always yielded 100, so the alert could never fire.
  query: '100 * sum without (dimension) (netdata_disk_space_GB_average{dimension=~"avail|cached"}) / sum without (dimension) (netdata_disk_space_GB_average) < 20'
  severity: warning
  for: 5m
|
|
- name: Netdata predicted disk full
|
|
description: Netdata predicted disk full in 24 hours
|
|
query: 'predict_linear(netdata_disk_space_GB_average{dimension=~"avail|cached"}[3h], 24 * 3600) < 0'
|
|
severity: warning
|
|
- name: Netdata MD mismatch cnt unsynchronized blocks
|
|
description: RAID array has unsynchronized blocks
|
|
query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Netdata disk reallocated sectors
|
|
description: Reallocated sectors on disk
|
|
query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
|
|
severity: info
|
|
- name: Netdata disk current pending sector
|
|
description: Disk current pending sector
|
|
query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
|
|
severity: warning
|
|
- name: Netdata reported uncorrectable disk sectors
|
|
description: Reported uncorrectable disk sectors
|
|
query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
|
|
severity: warning
|
|
|
|
- name: Databases and brokers
|
|
services:
|
|
- name: MySQL
|
|
exporters:
|
|
- name: prometheus/mysqld_exporter
|
|
slug: mysqld-exporter
|
|
doc_url: https://github.com/prometheus/mysqld_exporter
|
|
rules:
|
|
- name: MySQL down
|
|
description: MySQL instance is down on {{ $labels.instance }}
|
|
query: "mysql_up == 0"
|
|
severity: critical
|
|
- name: MySQL too many connections (> 80%)
|
|
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
|
|
query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL high prepared statements utilization (> 80%)
|
|
description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
|
|
query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL high threads running
|
|
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
|
|
query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL Slave IO thread not running
|
|
description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
|
|
query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
|
|
severity: critical
|
|
- name: MySQL Slave SQL thread not running
|
|
description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
|
|
query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
|
|
severity: critical
|
|
- name: MySQL Slave replication lag
|
|
description: "MySQL replication lag on {{ $labels.instance }}"
|
|
query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
|
|
severity: critical
|
|
for: 1m
|
|
- name: MySQL slow queries
|
|
description: MySQL server has new slow queries.
|
|
query: increase(mysql_global_status_slow_queries[1m]) > 0
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL InnoDB log waits
|
|
description: MySQL innodb log writes stalling
|
|
query: rate(mysql_global_status_innodb_log_waits[15m]) > 10
|
|
severity: warning
|
|
- name: MySQL restarted
|
|
description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
|
|
query: "mysql_global_status_uptime < 60"
|
|
severity: info
|
|
- name: MySQL High QPS
|
|
description: MySQL is being overloaded with unusual QPS (> 10k QPS).
|
|
query: "irate(mysql_global_status_questions[1m]) > 10000"
|
|
severity: info
|
|
for: 2m
|
|
- name: MySQL too many open files
|
|
description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.
|
|
query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL InnoDB Force Recovery is enabled
|
|
description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
|
|
query: "mysql_global_variables_innodb_force_recovery != 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MySQL InnoDB history_len too long
|
|
description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
|
|
query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: PostgreSQL
|
|
exporters:
|
|
- name: prometheus-community/postgres_exporter
|
|
slug: postgres-exporter
|
|
doc_url: https://github.com/prometheus-community/postgres_exporter
|
|
rules:
|
|
- name: Postgresql down
|
|
description: Postgresql instance is down
|
|
query: "pg_up == 0"
|
|
severity: critical
|
|
- name: Postgresql restarted
|
|
description: Postgresql restarted
|
|
query: "time() - pg_postmaster_start_time_seconds < 60"
|
|
severity: critical
|
|
- name: Postgresql exporter error
|
|
description: Postgresql exporter is showing errors. A query may be buggy in queries.yaml
|
|
query: "pg_exporter_last_scrape_error > 0"
|
|
severity: critical
|
|
- name: Postgresql table not auto vacuumed
|
|
description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
|
|
query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
|
|
severity: warning
|
|
- name: Postgresql table not auto analyzed
|
|
description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
|
|
query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
|
|
severity: warning
|
|
- name: Postgresql too many connections
|
|
description: PostgreSQL instance has too many connections (> 80%).
|
|
query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Postgresql not enough connections
|
|
description: PostgreSQL instance should have more connections (> 5)
|
|
query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Postgresql dead locks
|
|
description: PostgreSQL has dead-locks
|
|
query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
|
severity: warning
|
|
- name: Postgresql high rollback rate
|
|
description: Ratio of transactions being aborted compared to committed is > 2 %
|
|
query: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
|
|
severity: warning
|
|
- name: Postgresql commit rate low
|
|
description: Postgresql seems to be processing very few transactions
|
|
query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Postgresql low XID consumption
|
|
description: Postgresql seems to be consuming transaction IDs very slowly
|
|
query: "rate(pg_txid_current[1m]) < 5"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Postgresql high rate statement timeout
|
|
description: Postgres transactions showing high rate of statement timeouts
|
|
query: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
|
severity: critical
|
|
- name: Postgresql high rate deadlock
|
|
description: Postgres detected deadlocks
|
|
query: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
|
severity: critical
|
|
- name: Postgresql unused replication slot
|
|
description: Unused Replication Slots
|
|
query: "(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Postgresql too many dead tuples
|
|
description: PostgreSQL dead tuples is too large
|
|
query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Postgresql configuration changed
|
|
description: Postgres Database configuration change has occurred
|
|
query: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
|
|
severity: info
|
|
- name: Postgresql SSL compression active
|
|
description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
|
|
query: "sum(pg_stat_ssl_compression) > 0"
|
|
severity: critical
|
|
- name: Postgresql too many locks acquired
  description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
  # Fires when the acquired locks exceed 20% of the lock table capacity
  # (max_locks_per_transaction * max_connections).
  # The lock count is aggregated per scrape target and matched explicitly:
  # a bare sum() yields a single label-less series, which can never be matched
  # one-to-one against the labelled pg_settings_* product, so the expression
  # would otherwise return no result at all.
  query: "(sum by (instance, job, server) (pg_locks_count) / on (instance, job, server) (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
  severity: critical
  for: 2m
|
|
- name: Postgresql bloat index high (> 80%)
|
|
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
|
|
query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
|
|
severity: warning
|
|
for: 1h
|
|
comments: |
|
|
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
|
- name: Postgresql bloat table high (> 80%)
|
|
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
|
|
query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
|
|
severity: warning
|
|
for: 1h
|
|
comments: |
|
|
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
|
- name: Postgresql invalid index
|
|
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
|
|
query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
|
severity: warning
|
|
for: 6h
|
|
comments: |
|
|
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
|
- name: Postgresql replication lag
|
|
description: The PostgreSQL replication lag is high (> 5s)
|
|
query: "pg_replication_lag_seconds > 5"
|
|
severity: warning
|
|
for: 30s
|
|
|
|
- name: SQL Server
|
|
exporters:
|
|
- name: Ozarklake/prometheus-mssql-exporter
|
|
slug: ozarklake-mssql-exporter
|
|
doc_url: https://github.com/Ozarklake/prometheus-mssql-exporter
|
|
rules:
|
|
- name: SQL Server down
|
|
description: SQL server instance is down
|
|
query: mssql_up == 0
|
|
severity: critical
|
|
- name: SQL Server deadlock
|
|
description: SQL Server is experiencing deadlocks.
|
|
query: increase(mssql_deadlocks[1m]) > 5
|
|
severity: warning
|
|
|
|
- name: Patroni
|
|
exporters:
|
|
- name: Embedded exporter (Patroni >= 2.1.0)
|
|
slug: embedded-exporter-patroni
|
|
doc_url: https://patroni.readthedocs.io/en/latest/rest_api.html?highlight=prometheus#monitoring-endpoint
|
|
rules:
|
|
- name: Patroni has no Leader
|
|
description: A leader node (neither primary nor standby) cannot be found inside the cluster {{ $labels.scope }}
|
|
query: (max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)
|
|
severity: critical
|
|
|
|
- name: PGBouncer
|
|
exporters:
|
|
- name: spreaker/prometheus-pgbouncer-exporter
|
|
slug: spreaker-pgbouncer-exporter
|
|
doc_url: https://github.com/spreaker/prometheus-pgbouncer-exporter
|
|
rules:
|
|
- name: PGBouncer active connections
|
|
description: PGBouncer pools are filling up
|
|
query: "pgbouncer_pools_server_active_connections > 200"
|
|
severity: warning
|
|
for: 2m
|
|
- name: PGBouncer errors
|
|
description: PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.
|
|
query: 'increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[1m]) > 10'
|
|
severity: warning
|
|
- name: PGBouncer max connections
  description: The number of PGBouncer client connections has reached max_client_conn.
  # Fires as soon as PGBouncer reports a "max_client_conn" rejection.
  # increase() needs at least two samples inside the range; a 30s window holds
  # fewer than that once the scrape interval is 30s or longer, so the range is
  # 1m, like the other increase() based rules in this file.
  query: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0'
  severity: critical
|
|
|
|
- name: Redis
|
|
exporters:
|
|
- name: oliver006/redis_exporter
|
|
slug: oliver006-redis-exporter
|
|
doc_url: https://github.com/oliver006/redis_exporter
|
|
rules:
|
|
- name: Redis down
|
|
description: Redis instance is down
|
|
query: "redis_up == 0"
|
|
severity: critical
|
|
- name: Redis missing master
|
|
description: Redis cluster has no node marked as master.
|
|
query: '(count(redis_instance_info{role="master"}) or vector(0)) < 1'
|
|
severity: critical
|
|
- name: Redis too many masters
|
|
description: Redis cluster has too many nodes marked as master.
|
|
query: 'count(redis_instance_info{role="master"}) > 1'
|
|
severity: critical
|
|
- name: Redis disconnected slaves
|
|
description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
|
|
query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
|
|
severity: critical
|
|
- name: Redis replication broken
|
|
description: Redis instance lost a slave
|
|
query: "delta(redis_connected_slaves[1m]) < 0"
|
|
severity: critical
|
|
- name: Redis cluster flapping
|
|
description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
|
|
query: "changes(redis_connected_slaves[1m]) > 1"
|
|
severity: critical
|
|
for: 2m
|
|
- name: Redis missing backup
|
|
description: Redis has not been backed up for 24 hours
|
|
query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24"
|
|
severity: critical
|
|
- name: Redis out of system memory
|
|
description: Redis is running out of system memory (> 90%)
|
|
query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90"
|
|
severity: warning
|
|
for: 2m
|
|
comments: |
|
|
The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
|
|
- name: Redis out of configured maxmemory
|
|
description: Redis is running out of configured maxmemory (> 90%)
|
|
query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Redis too many connections
|
|
description: Redis is running out of connections (> 90% used)
|
|
query: "redis_connected_clients / redis_config_maxclients * 100 > 90"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Redis not enough connections
|
|
description: Redis instance should have more connections (> 5)
|
|
query: "redis_connected_clients < 5"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Redis rejected connections
|
|
description: Some connections to Redis have been rejected
|
|
query: "increase(redis_rejected_connections_total[1m]) > 0"
|
|
severity: critical
|
|
|
|
- name: MongoDB
|
|
exporters:
|
|
- name: percona/mongodb_exporter
|
|
slug: percona-mongodb-exporter
|
|
doc_url: https://github.com/percona/mongodb_exporter
|
|
rules:
|
|
- name: MongoDB Down
|
|
description: MongoDB instance is down
|
|
query: "mongodb_up == 0"
|
|
severity: critical
|
|
- name: Mongodb replica member unhealthy
|
|
description: MongoDB replica member is not healthy
|
|
query: "mongodb_rs_members_health == 0"
|
|
severity: critical
|
|
- name: MongoDB replication lag
|
|
description: Mongodb replication lag is more than 10s
|
|
query: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
|
|
severity: critical
|
|
- name: MongoDB replication headroom
|
|
description: MongoDB replication headroom is <= 0
|
|
query: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
|
|
severity: critical
|
|
- name: MongoDB number cursors open
|
|
description: Too many cursors opened by MongoDB for clients (> 10k)
|
|
query: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
|
|
severity: warning
|
|
for: 2m
|
|
- name: MongoDB cursors timeouts
|
|
description: Too many cursors are timing out
|
|
query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MongoDB too many connections
  description: Too many connections (> 80%)
  # Share of the connection capacity in use, per instance:
  #   current / (current + available) * 100
  # The "current" series is read directly instead of through rate(), which
  # would measure how fast the value changes rather than how many connections
  # are open. The denominator is restricted to the current and available
  # series so that other conn_type values do not inflate the capacity.
  # NOTE(review): assumes current + available equals the connection limit, as
  # in MongoDB's serverStatus.connections - confirm against the exporter version.
  query: 'sum by (instance) (mongodb_ss_connections{conn_type="current"}) / sum by (instance) (mongodb_ss_connections{conn_type=~"current|available"}) * 100 > 80'
  severity: warning
  for: 2m
|
|
|
|
- name: dcu/mongodb_exporter
|
|
slug: dcu-mongodb-exporter
|
|
doc_url: https://github.com/dcu/mongodb_exporter
|
|
rules:
|
|
- name: MongoDB replication lag
|
|
description: Mongodb replication lag is more than 10s
|
|
query: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
|
|
severity: critical
|
|
- name: MongoDB replication Status 3
|
|
description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
|
|
query: "mongodb_replset_member_state == 3"
|
|
severity: critical
|
|
- name: MongoDB replication Status 6
|
|
description: MongoDB Replication set member as seen from another member of the set, is not yet known
|
|
query: "mongodb_replset_member_state == 6"
|
|
severity: critical
|
|
- name: MongoDB replication Status 8
|
|
description: MongoDB Replication set member as seen from another member of the set, is unreachable
|
|
query: "mongodb_replset_member_state == 8"
|
|
severity: critical
|
|
- name: MongoDB replication Status 9
|
|
description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
|
|
query: "mongodb_replset_member_state == 9"
|
|
severity: critical
|
|
- name: MongoDB replication Status 10
|
|
description: MongoDB Replication set member was once in a replica set but was subsequently removed
|
|
query: "mongodb_replset_member_state == 10"
|
|
severity: critical
|
|
- name: MongoDB number cursors open
|
|
description: Too many cursors opened by MongoDB for clients (> 10k)
|
|
query: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
|
|
severity: warning
|
|
for: 2m
|
|
- name: MongoDB cursors timeouts
|
|
description: Too many cursors are timing out
|
|
query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
|
|
severity: warning
|
|
for: 2m
|
|
- name: MongoDB too many connections
  description: Too many connections (> 80%)
  # Share of the connection capacity in use, per instance:
  #   current / (current + available) * 100
  # The "current" series is read directly instead of through rate(), which
  # would measure how fast the value changes rather than how many connections
  # are open. The denominator names the two states explicitly so that any
  # additional state series cannot distort the capacity.
  # NOTE(review): assumes current + available equals the connection limit, as
  # in MongoDB's serverStatus.connections - confirm against the exporter version.
  query: 'sum by (instance) (mongodb_connections{state="current"}) / sum by (instance) (mongodb_connections{state=~"current|available"}) * 100 > 80'
  severity: warning
  for: 2m
|
|
- name: MongoDB virtual memory usage
|
|
description: High memory usage
|
|
query: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: stefanprodan/mgob
|
|
slug: stefanprodan-mgob-exporter
|
|
doc_url: https://github.com/stefanprodan/mgob
|
|
rules:
|
|
- name: Mgob backup failed
|
|
description: MongoDB backup has failed
|
|
query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0'
|
|
severity: critical
|
|
|
|
- name: RabbitMQ
|
|
exporters:
|
|
- name: rabbitmq/rabbitmq-prometheus
|
|
slug: rabbitmq-exporter
|
|
doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
|
|
rules:
|
|
- name: RabbitMQ node down
|
|
description: Less than 3 nodes running in RabbitMQ cluster
|
|
query: "sum(rabbitmq_build_info) < 3"
|
|
severity: critical
|
|
- name: RabbitMQ node not distributed
|
|
description: Distribution link state is not 'up'
|
|
query: "erlang_vm_dist_node_state < 3"
|
|
severity: critical
|
|
- name: RabbitMQ instances different versions
|
|
description: Running different versions of RabbitMQ in the same cluster can lead to failure.
|
|
query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
|
|
severity: warning
|
|
for: 1h
|
|
- name: RabbitMQ memory high
|
|
description: A node uses more than 90% of allocated RAM
|
|
query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90"
|
|
severity: warning
|
|
for: 2m
|
|
- name: RabbitMQ file descriptors usage
|
|
description: A node uses more than 90% of file descriptors
|
|
query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90"
|
|
severity: warning
|
|
for: 2m
|
|
- name: RabbitMQ too many ready messages
  # The query aggregates BY (queue), so "queue" is the only label left on the
  # resulting series; the description therefore references it rather than the
  # instance label (which was also misspelled as "instace" and rendered empty).
  description: RabbitMQ too many ready messages in queue {{ $labels.queue }}
  query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
  severity: warning
  for: 1m
|
|
- name: RabbitMQ too many unack messages
|
|
description: Too many unacknowledged messages
|
|
query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
|
|
severity: warning
|
|
for: 1m
|
|
- name: RabbitMQ too many connections
|
|
description: The total connections of a node is too high
|
|
query: "rabbitmq_connections > 1000"
|
|
severity: warning
|
|
for: 2m
|
|
- name: RabbitMQ no queue consumer
|
|
description: A queue has less than 1 consumer
|
|
query: "rabbitmq_queue_consumers < 1"
|
|
severity: warning
|
|
for: 1m # allows a short service restart
|
|
- name: RabbitMQ unroutable messages
|
|
description: A queue has unroutable messages
|
|
query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: kbudde/rabbitmq-exporter
|
|
slug: kbudde-rabbitmq-exporter
|
|
doc_url: https://github.com/kbudde/rabbitmq_exporter
|
|
rules:
|
|
- name: RabbitMQ down
|
|
description: RabbitMQ node down
|
|
query: "rabbitmq_up == 0"
|
|
severity: critical
|
|
- name: RabbitMQ cluster down
|
|
description: Less than 3 nodes running in RabbitMQ cluster
|
|
query: "sum(rabbitmq_running) < 3"
|
|
severity: critical
|
|
- name: RabbitMQ cluster partition
|
|
description: Cluster partition
|
|
query: "rabbitmq_partitions > 0"
|
|
severity: critical
|
|
- name: RabbitMQ out of memory
|
|
description: Memory available for RabbitMQ is low (< 10%)
|
|
query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90"
|
|
severity: warning
|
|
for: 2m
|
|
- name: RabbitMQ too many connections
|
|
description: RabbitMQ instance has too many connections (> 1000)
|
|
query: "rabbitmq_connectionsTotal > 1000"
|
|
severity: warning
|
|
for: 2m
|
|
- name: RabbitMQ dead letter queue filling up
|
|
description: Dead letter queue is filling up (> 10 msgs)
|
|
query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
|
|
severity: warning
|
|
for: 1m
|
|
comments: |
|
|
Indicate the queue name in dedicated label.
|
|
- name: RabbitMQ too many messages in queue
|
|
description: Queue is filling up (> 1000 msgs)
|
|
query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
|
|
severity: warning
|
|
for: 2m
|
|
comments: |
|
|
Indicate the queue name in dedicated label.
|
|
- name: RabbitMQ slow queue consuming
|
|
description: Queue messages are consumed slowly (> 60s)
|
|
query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
|
|
severity: warning
|
|
for: 2m
|
|
comments: |
|
|
Indicate the queue name in dedicated label.
|
|
- name: RabbitMQ no consumer
|
|
description: Queue has no consumer
|
|
query: "rabbitmq_queue_consumers == 0"
|
|
severity: critical
|
|
for: 1m # allows a short service restart
|
|
- name: RabbitMQ too many consumers
|
|
description: Queue should have only 1 consumer
|
|
query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
|
|
severity: critical
|
|
comments: |
|
|
Indicate the queue name in dedicated label.
|
|
- name: RabbitMQ unactive exchange
|
|
description: Exchange receives less than 5 msgs per second
|
|
query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
|
|
severity: warning
|
|
comments: |
|
|
Indicate the exchange name in dedicated label.
|
|
for: 2m
|
|
|
|
- name: Elasticsearch
|
|
exporters:
|
|
- name: prometheus-community/elasticsearch_exporter
|
|
slug: prometheus-community-elasticsearch-exporter
|
|
doc_url: https://github.com/prometheus-community/elasticsearch_exporter
|
|
rules:
|
|
- name: Elasticsearch Heap Usage Too High
|
|
description: "The heap usage is over 90%"
|
|
query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Elasticsearch Heap Usage warning
|
|
description: "The heap usage is over 80%"
|
|
query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Elasticsearch disk out of space
|
|
description: The disk usage is over 90%
|
|
query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10"
|
|
severity: critical
|
|
- name: Elasticsearch disk space low
|
|
description: The disk usage is over 80%
|
|
query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Elasticsearch Cluster Red
|
|
description: Elastic Cluster Red status
|
|
query: 'elasticsearch_cluster_health_status{color="red"} == 1'
|
|
severity: critical
|
|
- name: Elasticsearch Cluster Yellow
|
|
description: Elastic Cluster Yellow status
|
|
query: 'elasticsearch_cluster_health_status{color="yellow"} == 1'
|
|
severity: warning
|
|
- name: Elasticsearch Healthy Nodes
|
|
description: "Missing node in Elasticsearch cluster"
|
|
query: "elasticsearch_cluster_health_number_of_nodes < 3"
|
|
severity: critical
|
|
- name: Elasticsearch Healthy Data Nodes
|
|
description: "Missing data node in Elasticsearch cluster"
|
|
query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
|
|
severity: critical
|
|
- name: Elasticsearch relocating shards
|
|
description: "Elasticsearch is relocating shards"
|
|
query: "elasticsearch_cluster_health_relocating_shards > 0"
|
|
severity: info
|
|
- name: Elasticsearch relocating shards too long
|
|
description: "Elasticsearch has been relocating shards for 15min"
|
|
query: "elasticsearch_cluster_health_relocating_shards > 0"
|
|
severity: warning
|
|
for: 15m
|
|
- name: Elasticsearch initializing shards
|
|
description: "Elasticsearch is initializing shards"
|
|
query: "elasticsearch_cluster_health_initializing_shards > 0"
|
|
severity: info
|
|
- name: Elasticsearch initializing shards too long
|
|
description: "Elasticsearch has been initializing shards for 15 min"
|
|
query: "elasticsearch_cluster_health_initializing_shards > 0"
|
|
severity: warning
|
|
for: 15m
|
|
- name: Elasticsearch unassigned shards
|
|
description: "Elasticsearch has unassigned shards"
|
|
query: "elasticsearch_cluster_health_unassigned_shards > 0"
|
|
severity: critical
|
|
- name: Elasticsearch pending tasks
|
|
description: "Elasticsearch has pending tasks. Cluster works slowly."
|
|
query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
|
|
severity: warning
|
|
for: 15m
|
|
- name: Elasticsearch no new documents
|
|
description: "No new documents for 10 min!"
|
|
query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
|
|
severity: warning
|
|
- name: Elasticsearch High Indexing Latency
|
|
description: "The indexing latency on Elasticsearch cluster is higher than the threshold."
|
|
query: "increase(elasticsearch_indices_indexing_index_time_seconds_total[1m]) / increase(elasticsearch_indices_indexing_index_total[1m]) > 0.0005"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Elasticsearch High Indexing Rate
|
|
description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
|
|
query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Elasticsearch High Query Rate
|
|
description: "The query rate on Elasticsearch cluster is higher than the threshold."
|
|
query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Elasticsearch High Query Latency
  description: "The query latency on Elasticsearch cluster is higher than the threshold."
  # Average time per search query over the last minute: query-phase time
  # divided by the number of queries, mirroring the indexing latency rule.
  # The query-phase counters are used (not the fetch-phase ones) so that the
  # expression measures what the rule name and description announce.
  # NOTE(review): confirm that the deployed exporter exposes
  # elasticsearch_indices_search_query_time_seconds.
  query: "increase(elasticsearch_indices_search_query_time_seconds[1m]) / increase(elasticsearch_indices_search_query_total[1m]) > 1"
  severity: warning
  for: 5m
|
|
|
|
- name: Meilisearch
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://github.com/orgs/meilisearch/discussions/625
|
|
rules:
|
|
- name: Meilisearch index is empty
|
|
description: Meilisearch index has no documents
|
|
query: "meilisearch_index_docs_count == 0"
|
|
severity: warning
|
|
- name: Meilisearch http response time
|
|
description: Meilisearch http response time is too high
|
|
query: "meilisearch_http_response_time_seconds > 0.5"
|
|
severity: warning
|
|
|
|
- name: Cassandra
|
|
exporters:
|
|
- name: instaclustr/cassandra-exporter
|
|
slug: instaclustr-cassandra-exporter
|
|
doc_url: https://github.com/instaclustr/cassandra-exporter
|
|
rules:
|
|
- name: "Cassandra Node is unavailable"
|
|
description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
|
|
query: "sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1"
|
|
severity: critical
|
|
- name: "Cassandra many compaction tasks are pending"
|
|
description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
|
|
query: "cassandra_table_estimated_pending_compactions > 100"
|
|
severity: warning
|
|
- name: "Cassandra commitlog pending tasks"
|
|
description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
|
|
query: "cassandra_commit_log_pending_tasks > 15"
|
|
for: 2m
|
|
severity: warning
|
|
- name: "Cassandra compaction executor blocked tasks"
|
|
description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
|
|
query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
|
|
for: 2m
|
|
severity: warning
|
|
- name: "Cassandra flush writer blocked tasks"
|
|
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
|
|
query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
|
|
for: 2m
|
|
severity: warning
|
|
- name: "Cassandra connection timeouts total"
|
|
description: "Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
|
|
query: "avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5"
|
|
for: 2m
|
|
severity: critical
|
|
- name: "Cassandra storage exceptions"
|
|
description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
|
|
query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
|
|
severity: critical
|
|
- name: "Cassandra tombstone dump"
|
|
description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
|
|
query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
|
|
for: 2m
|
|
severity: critical
|
|
- name: "Cassandra client request unavailable write"
|
|
description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
|
|
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
|
|
for: 2m
|
|
severity: critical
|
|
- name: "Cassandra client request unavailable read"
|
|
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
|
|
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
|
|
for: 2m
|
|
severity: critical
|
|
- name: "Cassandra client request write failure"
|
|
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
|
|
query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
|
|
for: 2m
|
|
severity: critical
|
|
- name: "Cassandra client request read failure"
|
|
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
|
|
query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
|
|
for: 2m
|
|
severity: critical
|
|
|
|
- name: criteo/cassandra_exporter
|
|
slug: criteo-cassandra-exporter
|
|
doc_url: https://github.com/criteo/cassandra_exporter
|
|
rules:
|
|
- name: Cassandra hints count
|
|
description: Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down
|
|
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3'
|
|
severity: critical
|
|
- name: Cassandra compaction task pending
|
|
description: Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.
|
|
query: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra viewwrite latency
|
|
description: High viewwrite latency on {{ $labels.instance }} cassandra node
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra bad hacker
|
|
description: Increase of Cassandra authentication failures
|
|
query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra node down
|
|
description: Cassandra node down
|
|
query: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
|
|
severity: critical
|
|
- name: Cassandra commitlog pending tasks
|
|
description: Unexpected number of Cassandra commitlog pending tasks
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra compaction executor blocked tasks
|
|
description: Some Cassandra compaction executor tasks are blocked
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra flush writer blocked tasks
|
|
description: Some Cassandra flush writer tasks are blocked
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra repair pending tasks
|
|
description: Some Cassandra repair tasks are pending
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra repair blocked tasks
|
|
description: Some Cassandra repair tasks are blocked
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Cassandra connection timeouts total
|
|
description: Some connections between nodes are ending in timeout
|
|
query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Cassandra storage exceptions
|
|
description: Something is going wrong with cassandra storage
|
|
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
|
|
severity: critical
|
|
- name: Cassandra tombstone dump
|
|
description: Too many tombstones scanned in queries
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
|
|
severity: critical
|
|
- name: Cassandra client request unavailable write
|
|
description: Write failures have occurred because too many nodes are unavailable
|
|
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
|
|
severity: critical
|
|
- name: Cassandra client request unavailable read
|
|
description: Read failures have occurred because too many nodes are unavailable
|
|
query: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
|
|
severity: critical
|
|
- name: Cassandra client request write failure
|
|
description: A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
|
|
query: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
|
|
severity: critical
|
|
- name: Cassandra client request read failure
|
|
description: A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.
|
|
query: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
|
|
severity: critical
|
|
- name: Cassandra cache hit rate key cache
|
|
description: Key cache hit rate is below 85%
|
|
query: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
|
|
severity: critical
|
|
for: 2m
|
|
|
|
- name: Clickhouse
|
|
exporters:
|
|
- name: Embedded Exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
|
|
rules:
|
|
- name: ClickHouse node down
|
|
description: "No metrics received from ClickHouse exporter for over 2 minutes."
|
|
query: 'up{job="clickhouse"} == 0'
|
|
severity: critical
|
|
for: 2m
|
|
- name: ClickHouse Memory Usage Critical
|
|
description: "Memory usage is critically high, over 90%."
|
|
query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90"
|
|
severity: critical
|
|
for: 5m
|
|
- name: ClickHouse Memory Usage Warning
|
|
description: "Memory usage is over 80%."
|
|
query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80"
|
|
severity: warning
|
|
for: 5m
|
|
- name: ClickHouse Disk Space Low on Default
|
|
description: "Disk space on default is below 20%."
|
|
query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20"
|
|
severity: warning
|
|
for: 2m
|
|
- name: ClickHouse Disk Space Critical on Default
|
|
description: "Disk space on default disk is critically low, below 10%."
|
|
query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10"
|
|
severity: critical
|
|
for: 2m
|
|
- name: ClickHouse Disk Space Low on Backups
|
|
description: "Disk space on backups is below 20%."
|
|
query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20"
|
|
severity: warning
|
|
for: 2m
|
|
- name: ClickHouse Replica Errors
|
|
description: "Critical replica errors detected, either all replicas are stale or lost."
|
|
query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1"
|
|
severity: critical
|
|
for: 0m
|
|
- name: ClickHouse No Available Replicas
|
|
description: "No available replicas in ClickHouse."
|
|
query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1"
|
|
severity: critical
|
|
for: 0m
|
|
- name: ClickHouse No Live Replicas
|
|
description: "There are too few live replicas available, risking data loss and service disruption."
|
|
query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1"
|
|
severity: critical
|
|
for: 0m
|
|
- name: ClickHouse High Network Traffic
|
|
description: "Network traffic is unusually high, may affect cluster performance."
|
|
query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250"
|
|
severity: warning
|
|
for: 5m
|
|
comments: |
|
|
Please replace the threshold with an appropriate value
|
|
- name: ClickHouse High TCP Connections
|
|
description: "High number of TCP connections, indicating heavy client or inter-cluster communication."
|
|
query: "ClickHouseMetrics_TCPConnection > 400"
|
|
severity: warning
|
|
for: 5m
|
|
comments: |
|
|
Please replace the threshold with an appropriate value
|
|
- name: ClickHouse Interserver Connection Issues
|
|
description: "An increase in interserver connections may indicate replication or distributed query handling issues."
|
|
query: "increase(ClickHouseMetrics_InterserverConnection[5m]) > 0"
|
|
severity: warning
|
|
for: 1m
|
|
- name: ClickHouse ZooKeeper Connection Issues
|
|
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination."
|
|
query: "avg(ClickHouseMetrics_ZooKeeperSession) != 1"
|
|
severity: warning
|
|
for: 3m
|
|
- name: ClickHouse Authentication Failures
|
|
description: "Authentication failures detected, indicating potential security issues or misconfiguration."
|
|
query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0"
|
|
severity: info
|
|
for: 0m
|
|
- name: ClickHouse Access Denied Errors
|
|
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts."
|
|
query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0"
|
|
severity: info
|
|
for: 0m
|
|
- name: ClickHouse rejected insert queries
|
|
description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
|
|
query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
|
|
severity: warning
|
|
for: 1m
|
|
- name: ClickHouse delayed insert queries
|
|
description: "INSERTs delayed due to high number of active parts."
|
|
query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: ClickHouse zookeeper hardware exception
|
|
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper"
|
|
query: "increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: ClickHouse high network usage
|
|
description: High network usage. ClickHouse network usage exceeds 100MB/s.
|
|
query: "rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024"
|
|
severity: warning
|
|
for: 2m
|
|
comments: |
|
|
Please replace the threshold with an appropriate value
|
|
- name: ClickHouse distributed rejected inserts
|
|
description: "INSERTs into Distributed tables rejected due to pending bytes limit."
|
|
query: "increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 0"
|
|
severity: critical
|
|
for: 2m
|
|
|
|
- name: CouchDB
|
|
exporters:
|
|
- name: gesellix/couchdb-prometheus-exporter
|
|
slug: gesellix-couchdb-prometheus-exporter
|
|
doc_url: https://github.com/gesellix/couchdb-prometheus-exporter
|
|
rules:
|
|
- name: CouchDB node down
|
|
description: CouchDB node is not responding (node_up metric is 0) for more than 2 minutes
|
|
query: "couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0"
|
|
severity: critical
|
|
for: 2m
|
|
- name: CouchDB atom memory usage critical
|
|
description: Atom memory usage is above 90% of limit
|
|
query: "couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom"
|
|
severity: critical
|
|
for: 5m
|
|
- name: CouchDB open databases critical
|
|
description: Number of open databases exceeds 90% of node capacity
|
|
query: "couchdb_httpd_open_databases > 0.9 * 1000"
|
|
severity: critical
|
|
for: 5m
|
|
- name: CouchDB open OS files critical
|
|
description: CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files
|
|
query: "couchdb_httpd_open_os_files > 0.9 * 65535"
|
|
severity: critical
|
|
for: 5m
|
|
- name: CouchDB 5xx error ratio high
|
|
description: More than 5% of HTTP requests are returning 5xx errors
|
|
query: "rate(couchdb_httpd_status_codes{code=~\"5..\"}[5m]) / rate(couchdb_httpd_requests[5m]) > 0.05"
|
|
severity: critical
|
|
for: 5m
|
|
- name: CouchDB temporary view read rate critical
|
|
description: Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation
|
|
query: "rate(couchdb_httpd_temporary_view_reads[5m]) > 100"
|
|
severity: critical
|
|
for: 5m
|
|
- name: CouchDB Mango queries scanning too many docs
|
|
description: Some Mango queries are scanning too many documents, consider adding indexes
|
|
query: "rate(couchdb_mango_too_many_docs_scanned[5m]) > 50"
|
|
severity: warning
|
|
for: 5m
|
|
- name: CouchDB Mango queries failed due to invalid index
|
|
description: Some Mango queries failed to execute because the index was missing or invalid
|
|
query: "rate(couchdb_mango_query_invalid_index[5m]) > 5"
|
|
severity: warning
|
|
for: 5m
|
|
- name: CouchDB Mango docs examined high
|
|
description: High number of documents examined per Mango queries, consider indexing
|
|
query: "rate(couchdb_mango_docs_examined[5m]) > 1000"
|
|
severity: warning
|
|
for: 5m
|
|
- name: CouchDB Replicator manager died
|
|
description: Replication manager process has crashed
|
|
query: "increase(couchdb_replicator_changes_manager_deaths[5m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: CouchDB Replicator queue process died
|
|
description: Replication queue process has crashed
|
|
query: "increase(couchdb_replicator_changes_queue_deaths[5m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: CouchDB Replicator reader process died
|
|
description: Replication reader process has crashed
|
|
query: "increase(couchdb_replicator_changes_reader_deaths[5m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: CouchDB Replicator failed to start
|
|
description: One or more replication tasks failed to start
|
|
query: "increase(couchdb_replicator_failed_starts[5m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: CouchDB replication cluster unstable
|
|
description: The replication cluster is unstable, replication may be interrupted
|
|
query: "couchdb_replicator_cluster_is_stable == 0"
|
|
severity: critical
|
|
for: 2m
|
|
- name: CouchDB replication read failures
|
|
description: Replication changes feed has failed reads more than 5 times in 5 minutes
|
|
query: "increase(couchdb_replicator_changes_read_failures[5m]) > 5"
|
|
severity: warning
|
|
for: 5m
|
|
- name: CouchDB file descriptors high
|
|
description: Process is using more than 85% of allowed file descriptors
|
|
query: "process_open_fds / process_max_fds > 0.85"
|
|
severity: warning
|
|
for: 5m
|
|
- name: CouchDB process restarted
|
|
description: CouchDB process has restarted recently
|
|
query: "changes(process_start_time_seconds[1h]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: CouchDB critical log entries
|
|
description: Critical or error log entries detected in the last 5 minutes
|
|
query: "increase(couchdb_server_couch_log{level=~\"error|critical\"}[5m]) > 0"
|
|
severity: critical
|
|
for: 1m
|
|
|
|
- name: Zookeeper
|
|
exporters:
|
|
- name: cloudflare/kafka_zookeeper_exporter
|
|
slug: cloudflare-kafka-zookeeper-exporter
|
|
doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
|
|
rules:
|
|
- name: dabealu/zookeeper-exporter
|
|
slug: dabealu-zookeeper-exporter
|
|
doc_url: https://github.com/dabealu/zookeeper-exporter
|
|
rules:
|
|
- name: Zookeeper Down
|
|
description: "Zookeeper down on instance {{ $labels.instance }}"
|
|
query: "zk_up == 0"
|
|
severity: critical
|
|
- name: Zookeeper missing leader
|
|
description: "Zookeeper cluster has no node marked as leader"
|
|
query: "sum(zk_server_leader) == 0"
|
|
severity: critical
|
|
- name: Zookeeper Too Many Leaders
|
|
description: "Zookeeper cluster has too many nodes marked as leader"
|
|
query: "sum(zk_server_leader) > 1"
|
|
severity: critical
|
|
- name: Zookeeper Not Ok
|
|
description: "Zookeeper instance is not ok"
|
|
query: "zk_ruok == 0"
|
|
severity: warning
|
|
for: 3m
|
|
|
|
- name: Kafka
|
|
exporters:
|
|
- name: danielqsj/kafka_exporter
|
|
slug: danielqsj-kafka-exporter
|
|
doc_url: https://github.com/danielqsj/kafka_exporter
|
|
rules:
|
|
- name: Kafka topics replicas
|
|
description: Kafka topic in-sync partition
|
|
query: "min(kafka_topic_partition_in_sync_replica) by (topic) < 3"
|
|
severity: critical
|
|
- name: Kafka consumers group
|
|
description: Kafka consumers group
|
|
query: "sum(kafka_consumergroup_lag) by (consumergroup) > 50"
|
|
severity: critical
|
|
for: 1m
|
|
- name: linkedin/Burrow
|
|
slug: linkedin-kafka-exporter
|
|
doc_url: https://github.com/linkedin/Burrow
|
|
rules:
|
|
- name: Kafka topic offset decreased
|
|
description: Kafka topic offset has decreased
|
|
query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
|
|
severity: warning
|
|
- name: Kafka consumer lag
|
|
description: Kafka consumer has a 30 minutes and increasing lag
|
|
query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
|
|
severity: warning
|
|
for: 15m
|
|
|
|
- name: Pulsar
|
|
exporters:
|
|
- name: embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://pulsar.apache.org/docs/reference-metrics/
|
|
rules:
|
|
- name: Pulsar subscription high number of backlog entries
|
|
description: "The number of subscription backlog entries is over 5k"
|
|
query: sum(pulsar_subscription_back_log) by (subscription) > 5000
|
|
for: 1h
|
|
severity: warning
|
|
- name: Pulsar subscription very high number of backlog entries
|
|
description: "The number of subscription backlog entries is over 100k"
|
|
query: sum(pulsar_subscription_back_log) by (subscription) > 100000
|
|
for: 1h
|
|
severity: critical
|
|
- name: Pulsar topic large backlog storage size
|
|
description: "The topic backlog storage size is over 5 GB"
|
|
query: sum(pulsar_storage_size > 5*1024*1024*1024) by (topic)
|
|
for: 1h
|
|
severity: warning
|
|
- name: Pulsar topic very large backlog storage size
|
|
description: "The topic backlog storage size is over 20 GB"
|
|
query: sum(pulsar_storage_size > 20*1024*1024*1024) by (topic)
|
|
for: 1h
|
|
severity: critical
|
|
- name: Pulsar high write latency
|
|
description: "Messages cannot be written in a timely fashion"
|
|
query: sum(pulsar_storage_write_latency_overflow > 0) by (topic)
|
|
for: 1h
|
|
severity: critical
|
|
- name: Pulsar large message payload
|
|
description: "Observing large message payload (> 1MB)"
|
|
query: sum(pulsar_entry_size_overflow > 0) by (topic)
|
|
for: 1h
|
|
severity: warning
|
|
- name: Pulsar high ledger disk usage
|
|
description: "Observing Ledger Disk Usage (> 75%)"
|
|
query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75
|
|
for: 1h
|
|
severity: critical
|
|
- name: Pulsar read only bookies
|
|
description: "Observing Readonly Bookies"
|
|
query: count(bookie_SERVER_STATUS{} == 0) by (pod)
|
|
for: 5m
|
|
severity: critical
|
|
- name: Pulsar high number of function errors
|
|
description: "Observing more than 10 Function errors per minute"
|
|
query: sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name)
|
|
for: 1m
|
|
severity: critical
|
|
- name: Pulsar high number of sink errors
|
|
description: "Observing more than 10 Sink errors per minute"
|
|
query: sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name)
|
|
for: 1m
|
|
severity: critical
|
|
|
|
- name: Nats
|
|
exporters:
|
|
- name: nats-io/prometheus-nats-exporter
|
|
slug: nats-exporter
|
|
doc_url: https://github.com/nats-io/prometheus-nats-exporter
|
|
rules:
|
|
- name: Nats high connection count
|
|
description: High number of NATS connections ({{ $value }}) for {{ $labels.instance }}
|
|
query: "gnatsd_varz_connections > 100"
|
|
severity: warning
|
|
for: 3m
|
|
- name: Nats high subscriptions count
|
|
description: High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}
|
|
query: "gnatsd_connz_subscriptions > 50"
|
|
severity: warning
|
|
for: 3m
|
|
- name: Nats high routes count
|
|
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
|
|
query: "gnatsd_varz_routes > 10"
|
|
severity: warning
|
|
for: 3m
|
|
- name: Nats high memory usage
|
|
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
|
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats slow consumers
|
|
description: There are slow consumers in NATS for {{ $labels.instance }}
|
|
query: "gnatsd_varz_slow_consumers > 0"
|
|
severity: critical
|
|
for: 3m
|
|
- name: Nats server down
|
|
description: NATS server has been down for more than 5 minutes
|
|
query: 'absent(up{job="nats"})'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Nats high CPU usage
|
|
description: NATS server is using more than 80% CPU for the last 5 minutes
|
|
query: "rate(gnatsd_varz_cpu[5m]) > 0.8"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats high number of connections
|
|
description: NATS server has more than 1000 active connections
|
|
query: "gnatsd_connz_num_connections > 1000"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats high JetStream store usage
|
|
description: JetStream store usage is over 80%
|
|
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats high JetStream memory usage
|
|
description: JetStream memory usage is over 80%
|
|
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats high number of subscriptions
|
|
description: NATS server has more than 1000 active subscriptions
|
|
query: "gnatsd_connz_subscriptions > 1000"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats high pending bytes
|
|
description: NATS server has more than 100,000 pending bytes
|
|
query: "gnatsd_connz_pending_bytes > 100000"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats too many errors
|
|
description: NATS server has encountered errors in the last 5 minutes
|
|
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats JetStream consumers exceeded
|
|
description: JetStream has more than 100 active consumers
|
|
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats frequent authentication timeouts
|
|
description: There have been more than 5 authentication timeouts in the last 5 minutes
|
|
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats max payload size exceeded
|
|
description: The max payload size allowed by NATS has been exceeded (1MB)
|
|
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
|
|
severity: critical
|
|
for: 5m
|
|
- name: Nats leaf node connection issue
|
|
description: No leaf node connections have been established in the last 5 minutes
|
|
query: "increase(gnatsd_varz_leafnodes[5m]) == 0"
|
|
severity: critical
|
|
for: 5m
|
|
- name: Nats max ping operations exceeded
|
|
description: The maximum number of ping operations in NATS has exceeded 50
|
|
query: "gnatsd_varz_ping_max > 50"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Nats write deadline exceeded
|
|
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
|
|
query: "gnatsd_varz_write_deadline > 10"
|
|
severity: critical
|
|
for: 5m
|
|
|
|
- name: Solr
|
|
exporters:
|
|
- name: embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html
|
|
rules:
|
|
- name: Solr update errors
|
|
description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
|
|
query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
|
|
severity: critical
|
|
- name: Solr query errors
|
|
description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
|
|
query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Solr replication errors
|
|
description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}.
|
|
query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1'
|
|
severity: critical
|
|
- name: Solr low live node count
|
|
description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
|
|
query: "solr_collections_live_nodes < 2"
|
|
severity: critical
|
|
|
|
- name: Hadoop
|
|
exporters:
|
|
- name: hadoop/jmx_exporter
|
|
slug: jmx_exporter
|
|
doc_url: https://github.com/prometheus/jmx_exporter
|
|
rules:
|
|
# Alert rule for NameNode availability
|
|
- name: Hadoop Name Node Down
|
|
query: up{job="hadoop-namenode"} == 0
|
|
for: 5m
|
|
severity: critical
|
|
description: "The Hadoop NameNode service is unavailable."
|
|
|
|
# Alert rule for ResourceManager availability
|
|
- name: Hadoop Resource Manager Down
|
|
query: up{job="hadoop-resourcemanager"} == 0
|
|
for: 5m
|
|
severity: critical
|
|
description: "The Hadoop ResourceManager service is unavailable."
|
|
|
|
# Alert rule for DataNode status
|
|
- name: Hadoop Data Node Out Of Service
|
|
query: hadoop_datanode_last_heartbeat == 0
|
|
for: 10m
|
|
severity: warning
|
|
description: "The Hadoop DataNode is not sending heartbeats."
|
|
|
|
# Alert rule for low HDFS disk space
|
|
- name: Hadoop HDFS Disk Space Low
|
|
query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
|
|
for: 15m
|
|
severity: warning
|
|
description: "Available HDFS disk space is running low."
|
|
|
|
# Alert rule for excessive MapReduce task failures
|
|
- name: Hadoop Map Reduce Task Failures
|
|
query: hadoop_mapreduce_task_failures_total > 100
|
|
for: 10m
|
|
severity: critical
|
|
description: "There is an unusually high number of MapReduce task failures."
|
|
|
|
# Alert rule for high ResourceManager memory usage
|
|
- name: Hadoop Resource Manager Memory High
|
|
query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
|
|
for: 15m
|
|
severity: warning
|
|
description: "The Hadoop ResourceManager is approaching its memory limit."
|
|
|
|
# Alert rule for high YARN container allocation failures
|
|
- name: Hadoop YARN Container Allocation Failures
|
|
query: hadoop_yarn_container_allocation_failures_total > 10
|
|
for: 10m
|
|
severity: warning
|
|
description: "There is a significant number of YARN container allocation failures."
|
|
|
|
# Alert rule for excessive HBase region server region count
|
|
- name: Hadoop HBase Region Count High
|
|
query: hadoop_hbase_region_count > 5000
|
|
for: 15m
|
|
severity: warning
|
|
description: "The HBase cluster has an unusually high number of regions."
|
|
|
|
# Alert rule for low HBase region server heap space
|
|
- name: Hadoop HBase Region Server Heap Low
|
|
query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
|
|
for: 10m
|
|
severity: critical
|
|
description: "HBase Region Servers are running low on heap space."
|
|
|
|
# Alert rule for high HBase Write Requests latency
|
|
- name: Hadoop HBase Write Requests Latency High
|
|
query: hadoop_hbase_write_requests_latency_seconds > 0.5
|
|
for: 10m
|
|
severity: warning
|
|
description: "HBase Write Requests are experiencing high latency."
|
|
|
|
- name: Reverse proxies and load balancers
|
|
services:
|
|
- name: Nginx
|
|
exporters:
|
|
- name: knyar/nginx-lua-prometheus
|
|
slug: knyar-nginx-exporter
|
|
doc_url: https://github.com/knyar/nginx-lua-prometheus
|
|
rules:
|
|
- name: Nginx high HTTP 4xx error rate
|
|
description: Too many HTTP requests with status 4xx (> 5%)
|
|
query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Nginx high HTTP 5xx error rate
|
|
description: Too many HTTP requests with status 5xx (> 5%)
|
|
query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Nginx latency high
|
|
description: Nginx p99 latency is higher than 3 seconds
|
|
query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: Apache
|
|
exporters:
|
|
- name: Lusitaniae/apache_exporter
|
|
slug: lusitaniae-apache-exporter
|
|
doc_url: https://github.com/Lusitaniae/apache_exporter
|
|
rules:
|
|
- name: Apache down
|
|
description: Apache down
|
|
query: "apache_up == 0"
|
|
severity: critical
|
|
- name: Apache workers load
|
|
description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
|
|
query: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Apache restart
|
|
description: Apache has just been restarted.
|
|
query: "apache_uptime_seconds_total / 60 < 1"
|
|
severity: warning
|
|
|
|
- name: HaProxy
|
|
exporters:
|
|
- name: Embedded exporter (HAProxy >= v2)
|
|
slug: embedded-exporter-v2
|
|
doc_url: https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter
|
|
rules:
|
|
- name: HAProxy high HTTP 4xx error rate backend
|
|
description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}
|
|
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy high HTTP 5xx error rate backend
|
|
description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}
|
|
query: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy high HTTP 4xx error rate server
|
|
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
|
|
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy high HTTP 5xx error rate server
|
|
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
|
|
query: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy server response errors
|
|
description: Too many response errors to {{ $labels.server }} server (> 5%).
|
|
query: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy backend connection errors
|
|
description: Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.
|
|
query: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy server connection errors
|
|
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
|
|
# Aggregate per server so the server label referenced by the description survives the aggregation.
query: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100
|
|
severity: critical
|
|
- name: HAProxy backend max active session > 80%
|
|
description: Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf "%.2f"}}%
|
|
query: ((haproxy_backend_current_sessions >0) * 100) / (haproxy_backend_limit_sessions > 0) > 80
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy pending requests
|
|
description: Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
|
|
query: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy HTTP slowing down
|
|
description: Average request time is increasing - {{ $value | printf "%.2f"}}
|
|
query: avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1
|
|
severity: warning
|
|
for: 1m
|
|
- name: HAProxy retry high
|
|
description: High rate of retry on {{ $labels.proxy }} - {{ $value | printf "%.2f"}}
|
|
query: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAproxy has no alive backends
|
|
description: HAProxy has no alive active or backup backends for {{ $labels.proxy }}
|
|
query: haproxy_backend_active_servers + haproxy_backend_backup_servers == 0
|
|
severity: critical
|
|
- name: HAProxy frontend security blocked requests
|
|
description: HAProxy is blocking requests for security reason
|
|
query: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy server healthcheck failure
|
|
description: Some server healthchecks are failing on {{ $labels.server }}
|
|
query: increase(haproxy_server_check_failures_total[1m]) > 0
|
|
severity: warning
|
|
for: 1m
|
|
- name: prometheus/haproxy_exporter (HAProxy < v2)
|
|
slug: haproxy-exporter-v1
|
|
doc_url: https://github.com/prometheus/haproxy_exporter
|
|
rules:
|
|
- name: HAProxy down
|
|
description: HAProxy down
|
|
query: "haproxy_up == 0"
|
|
severity: critical
|
|
- name: HAProxy high HTTP 4xx error rate backend
  # Only the `backend` label survives `sum by (backend)`, so it is the only
  # label rendered here.
  description: Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}
  # The ratio of the two rates is a fraction in [0, 1]; it is scaled to a
  # percentage so that "> 5" means 5%, as the description states. Without
  # the scaling the threshold could never be exceeded.
  query: '((sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
  severity: critical
  for: 1m
|
|
- name: HAProxy high HTTP 5xx error rate backend
  # Only the `backend` label survives `sum by (backend)`, so it is the only
  # label rendered here.
  description: Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}
  # The ratio of the two rates is a fraction in [0, 1]; it is scaled to a
  # percentage so that "> 5" means 5%, as the description states. Without
  # the scaling the threshold could never be exceeded.
  query: '((sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
  severity: critical
  for: 1m
|
|
- name: HAProxy high HTTP 4xx error rate server
|
|
description: Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}
|
|
query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy high HTTP 5xx error rate server
|
|
description: Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}
|
|
query: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy server response errors
|
|
description: Too many response errors to {{ $labels.server }} server (> 5%).
|
|
query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5"
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy backend connection errors
|
|
description: Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
|
|
query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
|
|
severity: critical
|
|
for: 1m
|
|
- name: HAProxy server connection errors
|
|
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
|
|
query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
|
|
severity: critical
|
|
- name: HAProxy backend max active session
|
|
description: HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).
|
|
query: "((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy pending requests
|
|
description: Some HAProxy requests are pending on {{ $labels.backend }} backend
|
|
query: "sum by (backend) (haproxy_backend_current_queue) > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy HTTP slowing down
|
|
description: Average request time is increasing
|
|
query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
|
|
severity: warning
|
|
for: 1m
|
|
- name: HAProxy retry high
|
|
description: High rate of retry on {{ $labels.backend }} backend
|
|
query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy backend down
|
|
description: HAProxy backend is down
|
|
query: "haproxy_backend_up == 0"
|
|
severity: critical
|
|
- name: HAProxy server down
|
|
description: HAProxy server is down
|
|
query: "haproxy_server_up == 0"
|
|
severity: critical
|
|
- name: HAProxy frontend security blocked requests
|
|
description: HAProxy is blocking requests for security reason
|
|
query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
|
|
severity: warning
|
|
for: 2m
|
|
- name: HAProxy server healthcheck failure
|
|
description: Some server healthchecks are failing on {{ $labels.server }}
|
|
query: "increase(haproxy_server_check_failures_total[1m]) > 0"
|
|
severity: warning
|
|
for: 1m
|
|
|
|
- name: Traefik
|
|
exporters:
|
|
- name: Embedded exporter v2
|
|
slug: embedded-exporter-v2
|
|
doc_url: https://docs.traefik.io/observability/metrics/prometheus/
|
|
rules:
|
|
- name: Traefik service down
  description: All Traefik services are down
  # count() counts the series that exist regardless of their value, so it is
  # never 0 (an empty input yields no result at all). Summing the up gauge
  # fires when every server of a service reports 0.
  # NOTE(review): assumes traefik_service_server_up is 1 for up and 0 for
  # down - confirm against the Traefik metrics reference.
  query: "sum(traefik_service_server_up) by (service) == 0"
  severity: critical
|
|
- name: Traefik high HTTP 4xx error rate service
|
|
description: Traefik service 4xx error rate is above 5%
|
|
query: 'sum(rate(traefik_service_requests_total{code=~"4.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Traefik high HTTP 5xx error rate service
|
|
description: Traefik service 5xx error rate is above 5%
|
|
query: 'sum(rate(traefik_service_requests_total{code=~"5.*"}[3m])) by (service) / sum(rate(traefik_service_requests_total[3m])) by (service) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Embedded exporter v1
|
|
slug: embedded-exporter-v1
|
|
doc_url: https://docs.traefik.io/observability/metrics/prometheus/
|
|
rules:
|
|
- name: Traefik backend down
  description: All Traefik backends are down
  # count() counts the series that exist regardless of their value, so it is
  # never 0 (an empty input yields no result at all). Summing the up gauge
  # fires when every server of a backend reports 0.
  # NOTE(review): assumes traefik_backend_server_up is 1 for up and 0 for
  # down - confirm against the Traefik v1 metrics reference.
  query: "sum(traefik_backend_server_up) by (backend) == 0"
  severity: critical
|
|
- name: Traefik high HTTP 4xx error rate backend
|
|
description: Traefik backend 4xx error rate is above 5%
|
|
query: 'sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Traefik high HTTP 5xx error rate backend
|
|
description: Traefik backend 5xx error rate is above 5%
|
|
query: 'sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
|
|
- name: Caddy
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://caddyserver.com/docs/metrics
|
|
rules:
|
|
- name: Caddy Reverse Proxy Down
  description: "All Caddy reverse proxies are down"
  # count() counts the series that exist regardless of their value, so it is
  # never 0. Summing the health gauge fires when an upstream reports 0.
  # NOTE(review): assumes caddy_reverse_proxy_upstreams_healthy is 1 for
  # healthy and 0 for unhealthy - confirm against the Caddy metrics docs.
  query: "sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
  severity: critical
  for: 0m
|
|
- name: Caddy high HTTP 4xx error rate service
|
|
description: "Caddy service 4xx error rate is above 5%"
|
|
query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
- name: Caddy high HTTP 5xx error rate service
|
|
description: "Caddy service 5xx error rate is above 5%"
|
|
query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
|
|
severity: critical
|
|
for: 1m
|
|
|
|
- name: Runtimes
|
|
services:
|
|
- name: PHP-FPM
|
|
exporters:
|
|
- name: bakins/php-fpm-exporter
|
|
slug: bakins-fpm-exporter
|
|
doc_url: https://github.com/bakins/php-fpm-exporter
|
|
rules:
|
|
- name: PHP-FPM max-children reached
|
|
description: PHP-FPM reached max children - {{ $labels.instance }}
|
|
query: "sum(phpfpm_max_children_reached_total) by (instance) > 0"
|
|
severity: warning
|
|
|
|
- name: JVM
|
|
exporters:
|
|
- name: java-client
|
|
slug: jvm-exporter
|
|
doc_url: https://github.com/prometheus/client_java
|
|
rules:
|
|
- name: JVM memory filling up
|
|
description: JVM memory is filling up (> 80%)
|
|
query: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80'
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: Sidekiq
|
|
exporters:
|
|
- name: Strech/sidekiq-prometheus-exporter
|
|
slug: strech-sidekiq-exporter
|
|
doc_url: https://github.com/Strech/sidekiq-prometheus-exporter
|
|
rules:
|
|
- name: Sidekiq queue size
|
|
description: Sidekiq queue {{ $labels.name }} is growing
|
|
query: "sidekiq_queue_size > 100"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Sidekiq scheduling latency too high
|
|
description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.
|
|
query: "max(sidekiq_queue_latency) > 60"
|
|
severity: critical
|
|
|
|
- name: Orchestrators
|
|
services:
|
|
- name: Kubernetes
|
|
exporters:
|
|
- name: kube-state-metrics
|
|
slug: kubestate-exporter
|
|
doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs
|
|
rules:
|
|
- name: Kubernetes Node not ready
|
|
summary: Kubernetes Node not ready (node {{ $labels.node }})
|
|
description: Node {{ $labels.node }} has been unready for a long time
|
|
query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
|
|
severity: critical
|
|
for: 10m
|
|
- name: Kubernetes Node scheduling disabled
|
|
summary: Kubernetes node scheduling disabled (node {{ $labels.node }})
|
|
description: Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.
|
|
query: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
|
|
severity: warning
|
|
for: 30m
|
|
comments: |
|
|
Kubernetes nodes with scheduling disabled are fine.
|
|
This alert can be useful to get warned when nodes stay unschedulable for a long time.
|
|
- name: Kubernetes Node memory pressure
|
|
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
|
description: "Node {{ $labels.node }} has MemoryPressure condition"
|
|
query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Kubernetes Node disk pressure
|
|
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
|
description: "Node {{ $labels.node }} has DiskPressure condition"
|
|
query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Kubernetes Node network unavailable
|
|
description: "Node {{ $labels.node }} has NetworkUnavailable condition"
|
|
query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Kubernetes Node out of pod capacity
|
|
description: "Node {{ $labels.node }} is out of pod capacity"
|
|
query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Kubernetes Container oom killer
|
|
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
|
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
|
|
query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
|
|
severity: warning
|
|
- name: Kubernetes Job failed
|
|
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
|
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
|
|
query: "kube_job_status_failed > 0"
|
|
severity: warning
|
|
- name: Kubernetes Job not starting
|
|
summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
|
|
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes"
|
|
query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600"
|
|
severity: warning
|
|
- name: Kubernetes CronJob failing
|
|
summary: Kubernetes CronJob failing ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
|
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing"
|
|
query: "(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)"
|
|
severity: critical
|
|
- name: Kubernetes CronJob suspended
|
|
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
|
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"
|
|
query: "kube_cronjob_spec_suspend != 0"
|
|
severity: warning
|
|
- name: Kubernetes PersistentVolumeClaim pending
|
|
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
|
|
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending"
|
|
query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Kubernetes Volume out of disk space
|
|
description: Volume is almost full (< 10% left)
|
|
query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Kubernetes Volume full in four days
  # The query's value is the predicted number of available bytes four days
  # from now (negative whenever the alert fires), not a percentage of free
  # space, so $value is deliberately not rendered in the description.
  description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days."
  # Linear extrapolation over the last 6h (5m resolution) projected
  # 4 * 24 * 3600 seconds ahead; below zero means the volume would be full.
  query: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
  severity: critical
|
|
- name: Kubernetes PersistentVolume error
|
|
summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
|
|
description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
|
|
query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
|
severity: critical
|
|
- name: Kubernetes StatefulSet down
|
|
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
|
|
query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: Kubernetes HPA scale inability
|
|
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
|
|
query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Kubernetes HPA metrics unavailability
|
|
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics
|
|
query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
|
|
severity: warning
|
|
- name: Kubernetes HPA scale maximum
|
|
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
|
|
query: "(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)"
|
|
severity: info
|
|
for: 2m
|
|
- name: Kubernetes HPA underutilized
  description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
  # Group by namespace as well as HPA name: the description renders the
  # namespace label, and HPAs sharing a name in different namespaces must
  # not be merged into one series.
  query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (namespace, horizontalpodautoscaler) > 3" # allow minimum 3 replicas running
  severity: info
|
|
- name: Kubernetes Pod not healthy
|
|
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.
|
|
query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
|
severity: critical
|
|
for: 15m
|
|
- name: Kubernetes pod crash looping
|
|
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
|
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
|
|
query: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Kubernetes ReplicaSet replicas mismatch
|
|
summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
|
description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
|
|
query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Kubernetes Deployment replicas mismatch
|
|
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
|
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
|
|
query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Kubernetes StatefulSet replicas mismatch
|
|
description: StatefulSet does not match the expected number of replicas.
|
|
query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Kubernetes Deployment generation mismatch
|
|
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
|
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
|
|
query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
|
|
severity: critical
|
|
for: 10m
|
|
- name: Kubernetes StatefulSet generation mismatch
|
|
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
|
|
query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
|
|
severity: critical
|
|
for: 10m
|
|
- name: Kubernetes StatefulSet update not rolled out
|
|
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
|
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
|
|
query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Kubernetes DaemonSet rollout stuck
|
|
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
|
description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
|
|
query: "kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Kubernetes DaemonSet misscheduled
|
|
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
|
description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
|
|
query: "kube_daemonset_status_number_misscheduled > 0"
|
|
severity: critical
|
|
for: 1m
|
|
- name: Kubernetes CronJob too long
  # kube_job_* series are labelled with job_name (as used by the other Job
  # rules in this file), so that label identifies the offending run.
  summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.job_name }})
  description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 1h to complete.
  # Jobs started more than one hour ago that have no completion time yet.
  # `unless` removes every job that exposes kube_job_status_completion_time.
  # The previous `and absent(...)` could never match: absent() returns either
  # nothing or a single label-less series, which never pairs with a labelled
  # job series.
  query: "(kube_job_status_start_time > 0 and (time() - kube_job_status_start_time) > 3600) unless kube_job_status_completion_time"
  severity: warning
  comments: |
    Threshold should be customized for each cronjob name.
|
|
- name: Kubernetes Job slow completion
|
|
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
|
description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
|
|
query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
|
|
severity: critical
|
|
for: 12h
|
|
- name: Kubernetes API server errors
|
|
description: Kubernetes API server is experiencing high error rate
|
|
query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Kubernetes API client errors
|
|
description: Kubernetes API client is experiencing high error rate
|
|
query: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Kubernetes client certificate expires next week
|
|
description: A client certificate used to authenticate to the apiserver is expiring next week.
|
|
query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
|
|
severity: warning
|
|
- name: Kubernetes client certificate expires soon
|
|
description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
|
|
query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
|
|
severity: critical
|
|
- name: Kubernetes API server latency
|
|
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
|
|
query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: Nomad
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: Nomad job failed
|
|
description: Nomad job failed
|
|
query: "nomad_nomad_job_summary_failed > 0"
|
|
severity: warning
|
|
- name: Nomad job lost
|
|
description: Nomad job lost
|
|
query: "nomad_nomad_job_summary_lost > 0"
|
|
severity: warning
|
|
- name: Nomad job queued
|
|
description: Nomad job queued
|
|
query: "nomad_nomad_job_summary_queued > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Nomad blocked evaluation
|
|
description: Nomad blocked evaluation
|
|
query: "nomad_nomad_blocked_evals_total_blocked > 0"
|
|
severity: warning
|
|
|
|
- name: Consul
|
|
exporters:
|
|
- name: prometheus/consul_exporter
|
|
slug: consul-exporter
|
|
doc_url: https://github.com/prometheus/consul_exporter
|
|
rules:
|
|
- name: Consul service healthcheck failed
|
|
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`"
|
|
query: "consul_catalog_service_node_healthy == 0"
|
|
severity: critical
|
|
for: 1m # allows a short service restart
|
|
- name: Consul missing master node
|
|
description: Number of Consul raft peers should be 3, in order to preserve quorum.
|
|
query: "consul_raft_peers < 3"
|
|
severity: critical
|
|
- name: Consul agent unhealthy
|
|
description: A Consul agent is down
|
|
query: 'consul_health_node_status{status="critical"} == 1'
|
|
severity: critical
|
|
|
|
- name: Etcd
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: Etcd insufficient Members
|
|
description: Etcd cluster should have an odd number of members
|
|
query: "count(etcd_server_id) % 2 == 0"
|
|
severity: critical
|
|
- name: Etcd no Leader
|
|
description: Etcd cluster has no leader
|
|
query: "etcd_server_has_leader == 0"
|
|
severity: critical
|
|
- name: Etcd high number of leader changes
|
|
description: Etcd leader changed more than 2 times during 10 minutes
|
|
query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
|
|
severity: warning
|
|
- name: Etcd high number of failed GRPC requests
|
|
description: More than 1% GRPC request failure detected in Etcd
|
|
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high number of failed GRPC requests
|
|
description: More than 5% GRPC request failure detected in Etcd
|
|
query: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
|
|
severity: critical
|
|
for: 2m
|
|
- name: Etcd GRPC requests slow
|
|
description: GRPC requests slowing down, 99th percentile is over 0.15s
|
|
query: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high number of failed HTTP requests
|
|
description: More than 1% HTTP failure detected in Etcd
|
|
query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high number of failed HTTP requests
|
|
description: More than 5% HTTP failure detected in Etcd
|
|
query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
|
|
severity: critical
|
|
for: 2m
|
|
- name: Etcd HTTP requests slow
|
|
description: HTTP requests slowing down, 99th percentile is over 0.15s
|
|
query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd member communication slow
|
|
description: Etcd member communication slowing down, 99th percentile is over 0.15s
|
|
query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high number of failed proposals
|
|
description: Etcd server got more than 5 failed proposals past hour
|
|
query: "increase(etcd_server_proposals_failed_total[1h]) > 5"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high fsync durations
|
|
description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s
|
|
query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Etcd high commit durations
|
|
description: Etcd commit duration increasing, 99th percentile is over 0.25s
|
|
query: "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: Linkerd
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://linkerd.io/2/tasks/exporting-metrics/
|
|
rules:
|
|
- name: Linkerd high error rate
  # `or` renders the first non-empty of the three workload labels. The
  # previous `a | b | c` form is a template pipeline that tries to call a
  # label value as a function, which fails when the annotation is expanded.
  description: Linkerd error rate for {{ or $labels.deployment $labels.statefulset $labels.daemonset }} is over 10%
  # Error ratio per workload, expressed as a percentage.
  query: "sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10"
  severity: warning
  for: 1m
|
|
|
|
- name: Istio
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/
|
|
rules:
|
|
- name: Istio Kubernetes gateway availability drop
|
|
description: Gateway pods have dropped. Inbound traffic will likely be affected.
|
|
query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2'
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio Pilot high total request rate
|
|
description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
|
|
query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio Mixer Prometheus dispatches low
|
|
description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.
|
|
query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio high total request rate
|
|
description: Global request rate in the service mesh is unusually high.
|
|
query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Istio low total request rate
|
|
description: Global request rate in the service mesh is unusually low.
|
|
query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
|
|
severity: warning
|
|
for: 2m
|
|
- name: Istio high 4xx error rate
|
|
description: High percentage of HTTP 4xx responses in Istio (> 5%).
|
|
query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio high 5xx error rate
|
|
description: High percentage of HTTP 5xx responses in Istio (> 5%).
|
|
query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio high request latency
|
|
description: Istio average requests execution is longer than 100ms.
|
|
query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio latency 99 percentile
|
|
description: Istio 1% slowest requests are longer than 1000ms.
|
|
query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Istio Pilot Duplicate Entry
|
|
description: Istio pilot duplicate entry error.
|
|
query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
|
|
severity: critical
|
|
|
|
- name: ArgoCD
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/
|
|
rules:
|
|
- name: ArgoCD service not synced
|
|
description: Service {{ $labels.name }} run by argo is currently not in sync.
|
|
query: 'argocd_app_info{sync_status!="Synced"} != 0'
|
|
severity: warning
|
|
for: 15m
|
|
- name: ArgoCD service unhealthy
|
|
description: Service {{ $labels.name }} run by argo is currently not healthy.
|
|
query: 'argocd_app_info{health_status!="Healthy"} != 0'
|
|
severity: warning
|
|
for: 15m
|
|
|
|
- name: FluxCD
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://fluxcd.io/flux/monitoring/metrics/
|
|
rules:
|
|
- name: Flux Kustomization Failure
|
|
description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.
|
|
query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Flux HelmRelease Failure
|
|
description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.
|
|
query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Flux Source Issue
|
|
description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).
|
|
query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Flux Image Issue
|
|
description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.
|
|
query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
|
|
severity: warning
|
|
for: 15m
|
|
|
|
- name: Network, security and storage
|
|
services:
|
|
- name: Ceph
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/
|
|
rules:
|
|
- name: Ceph State
|
|
description: Ceph instance unhealthy
|
|
query: "ceph_health_status != 0"
|
|
severity: critical
|
|
- name: Ceph monitor clock skew
|
|
description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
|
|
query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph monitor low space
|
|
description: Ceph monitor storage is low.
|
|
query: "ceph_monitor_avail_percent < 10"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph OSD Down
|
|
description: Ceph Object Storage Daemon Down
|
|
query: "ceph_osd_up == 0"
|
|
severity: critical
|
|
- name: Ceph high OSD latency
|
|
description: "Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state."
|
|
query: "ceph_osd_perf_apply_latency_seconds > 5"
|
|
severity: warning
|
|
for: 1m
|
|
- name: Ceph OSD low space
|
|
description: Ceph Object Storage Daemon is going out of space. Please add more disks.
|
|
query: ceph_osd_utilization > 90
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph OSD reweighted
|
|
description: Ceph Object Storage Daemon takes too much time to resize.
|
|
query: "ceph_osd_weight < 1"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph PG down
|
|
description: Some Ceph placement groups are down. Please ensure that all the data are available.
|
|
query: "ceph_pg_down > 0"
|
|
severity: critical
|
|
- name: Ceph PG incomplete
|
|
description: Some Ceph placement groups are incomplete. Please ensure that all the data are available.
|
|
query: "ceph_pg_incomplete > 0"
|
|
severity: critical
|
|
- name: Ceph PG inconsistent
|
|
description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
|
|
query: ceph_pg_inconsistent > 0
|
|
severity: warning
|
|
- name: Ceph PG activation long
|
|
description: Some Ceph placement groups are taking too long to activate.
|
|
query: "ceph_pg_activating > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph PG backfill full
|
|
description: Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
|
|
query: "ceph_pg_backfill_toofull > 0"
|
|
severity: warning
|
|
for: 2m
|
|
- name: Ceph PG unavailable
|
|
description: Some Ceph placement groups are unavailable.
|
|
query: "ceph_pg_total - ceph_pg_active > 0"
|
|
severity: critical
|
|
|
|
- name: SpeedTest
|
|
exporters:
|
|
- name: Speedtest exporter
|
|
slug: nlamirault-speedtest-exporter
|
|
doc_url: https://github.com/nlamirault/speedtest_exporter
|
|
rules:
|
|
- name: SpeedTest Slow Internet Download
|
|
description: Internet download speed is currently {{humanize $value}} Mbps.
|
|
query: "avg_over_time(speedtest_download[10m]) < 100"
|
|
severity: warning
|
|
- name: SpeedTest Slow Internet Upload
|
|
description: Internet upload speed is currently {{humanize $value}} Mbps.
|
|
query: "avg_over_time(speedtest_upload[10m]) < 20"
|
|
severity: warning
|
|
|
|
- name: ZFS
|
|
exporters:
|
|
- name: node-exporter
|
|
slug: node-exporter
|
|
doc_url: https://github.com/prometheus/node_exporter
|
|
rules:
|
|
- name: ZFS offline pool
|
|
description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}."
|
|
query: 'node_zfs_zpool_state{state!="online"} > 0'
|
|
severity: critical
|
|
for: 1m
|
|
- name: ZFS exporter
|
|
slug: zfs_exporter
|
|
doc_url: https://github.com/pdf/zfs_exporter
|
|
rules:
|
|
- name: ZFS pool out of space
|
|
description: Disk is almost full (< 10% left)
|
|
# Join the read-only flag per pool. NOTE(review): zfs_exporter pool series are expected to carry a `pool` label (not `device`/`mountpoint`) - confirm against the exporter in use.
|
|
query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, pool) zfs_pool_readonly == 0"
|
|
severity: warning
|
|
- name: ZFS pool unhealthy
|
|
description: ZFS pool state is {{ $value }}. See comments for more information.
|
|
query: "zfs_pool_health > 0"
|
|
severity: critical
|
|
comments: |
|
|
0: ONLINE
|
|
1: DEGRADED
|
|
2: FAULTED
|
|
3: OFFLINE
|
|
4: UNAVAIL
|
|
5: REMOVED
|
|
6: SUSPENDED
|
|
- name: ZFS collector failed
|
|
description: ZFS collector for {{ $labels.instance }} has failed to collect information
|
|
query: "zfs_scrape_collector_success != 1"
|
|
severity: warning
|
|
|
|
- name: OpenEBS
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: OpenEBS used pool capacity
|
|
description: "OpenEBS pool uses more than 80% of its capacity"
|
|
query: "openebs_used_pool_capacity_percent > 80"
|
|
severity: warning
|
|
for: 2m
|
|
|
|
- name: Minio
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: Minio cluster disk offline
|
|
description: "Minio cluster disk is offline"
|
|
query: "minio_cluster_drive_offline_total > 0"
|
|
severity: critical
|
|
- name: Minio node disk offline
|
|
description: "Minio cluster node is offline"
|
|
query: "minio_cluster_nodes_offline_total > 0"
|
|
severity: critical
|
|
- name: Minio disk space usage
|
|
description: "Minio available free space is low (< 10%)"
|
|
query: disk_storage_available / disk_storage_total * 100 < 10
|
|
severity: warning
|
|
|
|
- name: SSL/TLS
|
|
exporters:
|
|
- name: ssl_exporter
|
|
slug: ribbybibby-ssl-exporter
|
|
doc_url: https://github.com/ribbybibby/ssl_exporter
|
|
rules:
|
|
- name: SSL certificate probe failed
|
|
description: Failed to fetch SSL information {{ $labels.instance }}
|
|
query: ssl_probe_success == 0
|
|
severity: critical
|
|
- name: SSL certificate OCSP status unknown
|
|
description: Failed to get the OCSP status {{ $labels.instance }}
|
|
query: ssl_ocsp_response_status == 2
|
|
severity: warning
|
|
- name: SSL certificate revoked
|
|
description: SSL certificate revoked {{ $labels.instance }}
|
|
query: ssl_ocsp_response_status == 1
|
|
severity: critical
|
|
- name: SSL certificate expiry (< 7 days)
|
|
description: "{{ $labels.instance }} Certificate is expiring in 7 days"
|
|
query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
|
|
severity: warning
|
|
|
|
- name: Juniper
|
|
exporters:
|
|
- name: czerwonk/junos_exporter
|
|
slug: czerwonk-junos-exporter
|
|
doc_url: https://github.com/czerwonk/junos_exporter
|
|
rules:
|
|
- name: Juniper switch down
|
|
description: The switch appears to be down
|
|
query: junos_up == 0
|
|
severity: critical
|
|
- name: Juniper high Bandwidth Usage 1GiB
|
|
description: Interface is highly saturated. (> 0.90 Gbit/s)
|
|
query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90"
|
|
severity: critical
|
|
for: 1m
|
|
- name: Juniper high Bandwidth Usage 1GiB
|
|
description: Interface is getting saturated. (> 0.80 Gbit/s)
|
|
query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80"
|
|
severity: warning
|
|
for: 1m
|
|
|
|
- name: CoreDNS
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: CoreDNS Panic Count
|
|
description: Number of CoreDNS panics encountered
|
|
query: "increase(coredns_panics_total[1m]) > 0"
|
|
severity: critical
|
|
|
|
- name: Freeswitch
|
|
exporters:
|
|
- name: znerol/prometheus-freeswitch-exporter
|
|
slug: znerol-freeswitch-exporter
|
|
doc_url: https://pypi.org/project/prometheus-freeswitch-exporter
|
|
rules:
|
|
- name: Freeswitch down
|
|
description: Freeswitch is unresponsive
|
|
query: "freeswitch_up == 0"
|
|
severity: critical
|
|
- name: Freeswitch Sessions Warning
|
|
description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80"
|
|
severity: warning
|
|
for: 10m
|
|
- name: Freeswitch Sessions Critical
|
|
description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90"
|
|
severity: critical
|
|
for: 5m
|
|
|
|
- name: Hashicorp Vault
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus
|
|
rules:
|
|
- name: Vault sealed
|
|
description: "Vault instance is sealed on {{ $labels.instance }}"
|
|
query: "vault_core_unsealed == 0"
|
|
severity: critical
|
|
- name: Vault too many pending tokens
|
|
description: 'Too many pending tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "avg(vault_token_create_count - vault_token_store_count) > 0"
|
|
severity: warning
|
|
for: 5m
|
|
- name: Vault too many infinity tokens
|
|
description: 'Too many infinity tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Vault cluster health
|
|
description: 'Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
|
|
query: "sum(vault_core_active) / count(vault_core_active) <= 0.5"
|
|
severity: critical
|
|
|
|
- name: Cloudflare
|
|
exporters:
|
|
- name: lablabs/cloudflare-exporter
|
|
slug: lablabs-cloudflare-exporter
|
|
doc_url: https://github.com/lablabs/cloudflare-exporter
|
|
rules:
|
|
- name: Cloudflare http 4xx error rate
|
|
description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})"
|
|
query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5'
|
|
severity: warning
|
|
- name: Cloudflare http 5xx error rate
|
|
description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})"
|
|
query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
|
|
severity: critical
|
|
|
|
- name: Other
|
|
services:
|
|
- name: Thanos
|
|
exporters:
|
|
- name: Thanos Compactor
|
|
slug: thanos-compactor
|
|
rules:
|
|
- name: Thanos Compactor Multiple Running
|
|
description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running."
|
|
query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Compactor Halted
|
|
description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
|
|
query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Compactor High Compaction Failures
|
|
description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
|
|
query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Compact Bucket High Operation Failures
|
|
description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
|
|
query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Compact Has Not Run
|
|
description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours."
|
|
query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
|
|
severity: warning
|
|
for: 0m
|
|
- name: Thanos Query
|
|
slug: thanos-query
|
|
rules:
|
|
- name: Thanos Query Http Request Query Error Rate High
|
|
description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
|
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Query Http Request Query Range Error Rate High
|
|
description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
|
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Query Grpc Server Error Rate
|
|
description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
|
|
query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Query Grpc Client Error Rate
|
|
description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests."
|
|
query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Query High D N S Failures
|
|
description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints."
|
|
query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Query Instant Latency High
|
|
description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
|
|
# The second clause only checks that the handler is receiving traffic; it uses the histogram _count series like the range-query latency rule.
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
|
|
severity: critical
|
|
for: 10m
|
|
- name: Thanos Query Range Latency High
|
|
description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries."
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
|
|
severity: critical
|
|
for: 10m
|
|
- name: Thanos Query Overload
|
|
description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
|
|
query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)"
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Receiver
|
|
slug: thanos-receiver
|
|
rules:
|
|
- name: Thanos Receive Http Request Error Rate High
|
|
description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
|
|
query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Receive Http Request Latency High
|
|
description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests."
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
|
|
severity: critical
|
|
for: 10m
|
|
- name: Thanos Receive High Replication Failures
|
|
description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
|
|
# The error ratio (as a percentage) is the left operand so that $value is the percentage; the replication-factor guard is joined with on (job) because both sides are aggregated by job.
|
|
query: '(sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m])) > max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"})) * 100 and on (job) max by (job) (thanos_receive_replication_factor{job=~".*thanos-receive.*"}) > 1'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Receive High Forward Request Failures
|
|
description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
|
|
query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
|
|
severity: info
|
|
for: 5m
|
|
- name: Thanos Receive High Hashring File Refresh Failures
|
|
description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
|
|
query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Receive Config Reload Failure
|
|
description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
|
|
query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Receive No Upload
|
|
description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage."
|
|
query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
|
|
severity: critical
|
|
for: 3h
|
|
- name: Thanos Sidecar
|
|
slug: thanos-sidecar
|
|
rules:
|
|
- name: Thanos Sidecar Bucket Operations Failed
|
|
description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing"
|
|
query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Sidecar No Connection To Started Prometheus
|
|
description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
|
|
query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Store
|
|
slug: thanos-store
|
|
rules:
|
|
- name: Thanos Store Grpc Error Rate
|
|
description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
|
|
query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Store Series Gate Latency High
|
|
description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
|
severity: warning
|
|
for: 10m
|
|
- name: Thanos Store Bucket High Operation Failures
|
|
description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
|
|
query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Store Objstore Operation Latency High
|
|
description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations."
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
|
|
severity: warning
|
|
for: 10m
|
|
- name: Thanos Ruler
|
|
slug: thanos-ruler
|
|
rules:
|
|
- name: Thanos Rule Queue Is Dropping Alerts
|
|
description: "Thanos Rule {{$labels.instance}} is failing to queue alerts."
|
|
query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Rule Sender Is Failing Alerts
|
|
description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager."
|
|
query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Rule High Rule Evaluation Failures
|
|
description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules."
|
|
query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Rule High Rule Evaluation Warnings
|
|
description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings."
|
|
query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
|
|
severity: info
|
|
for: 15m
|
|
- name: Thanos Rule Rule Evaluation Latency High
|
|
description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}."
|
|
query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Rule Grpc Error Rate
|
|
description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
|
|
query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
|
|
severity: warning
|
|
for: 5m
|
|
- name: Thanos Rule Config Reload Failure
|
|
description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
|
|
query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
|
|
severity: info
|
|
for: 5m
|
|
- name: Thanos Rule Query High D N S Failures
|
|
description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
|
|
query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Rule Alertmanager High D N S Failures
|
|
description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
|
|
query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
|
|
severity: warning
|
|
for: 15m
|
|
- name: Thanos Rule No Evaluation For10 Intervals
|
|
description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval."
|
|
query: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
|
|
severity: info
|
|
for: 5m
|
|
- name: Thanos No Rule Evaluations
|
|
description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
|
|
query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Bucket Replicate
|
|
slug: thanos-bucket-replicate
|
|
rules:
|
|
- name: Thanos Bucket Replicate Error Rate
|
|
description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
|
|
query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Bucket Replicate Run Latency
|
|
description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
|
|
# histogram_quantile needs the le label, so the bucket rates are summed by (job, le); the second clause only checks that replication runs are happening.
|
|
query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Component Absent
|
|
slug: thanos-component-absent
|
|
rules:
|
|
- name: Thanos Compact Is Down
|
|
description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Query Is Down
|
|
description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: 'absent(up{job=~".*thanos-query.*"} == 1)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Receive Is Down
|
|
description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Rule Is Down
|
|
description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Sidecar Is Down
|
|
description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Thanos Store Is Down
|
|
description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered."
|
|
query: absent(up{job=~".*thanos-store.*"} == 1)
|
|
severity: critical
|
|
for: 5m
|
|
|
|
- name: Loki
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: Loki process too many restarts
|
|
description: A loki process had too many restarts (target {{ $labels.instance }})
|
|
query: changes(process_start_time_seconds{job=~".*loki.*"}[15m]) > 2
|
|
severity: warning
|
|
- name: Loki request errors
|
|
description: The {{ $labels.job }} and {{ $labels.route }} are experiencing errors
|
|
query: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
|
|
severity: critical
|
|
for: 15m
|
|
- name: Loki request panic
|
|
description: The {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} panics over the last 10 minutes
|
|
query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
|
|
severity: critical
|
|
for: 5m
|
|
- name: Loki request latency
|
|
description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency
|
|
# Keep namespace, job and route in the aggregation (same grouping as the Loki request errors rule) so the labels used in the description are populated.
|
|
query: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le))) > 1'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Promtail
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
- name: Promtail request errors
|
|
description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|
|
query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
|
|
severity: critical
|
|
for: 5m
|
|
- name: Promtail request latency
|
|
description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
|
|
# Keep the same grouping labels as the Promtail request errors rule so the labels used in the description are populated.
|
|
query: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1'
|
|
severity: critical
|
|
for: 5m
|
|
# Cortex rules based on the metrics Cortex exposes itself.
- name: Cortex
  exporters:
    - name: Embedded exporter
      slug: embedded-exporter
      rules:
        # Last ruler configuration reload flag is not 1.
        - name: Cortex ruler configuration reload failure
          description: 'Cortex ruler configuration reload failure (instance {{ $labels.instance }})'
          query: 'cortex_ruler_config_last_reload_successful != 1'
          severity: warning
        # Fewer than one discovered Alertmanager.
        - name: Cortex not connected to Alertmanager
          description: 'Cortex not connected to Alertmanager (instance {{ $labels.instance }})'
          query: 'cortex_prometheus_notifications_alertmanagers_discovered < 1'
          severity: critical
        # Non-zero rate of dropped notifications over 5 minutes.
        - name: Cortex notification are being dropped
          description: 'Cortex notification are being dropped due to errors (instance {{ $labels.instance }})'
          query: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0'
          severity: critical
        # Non-zero rate of notification errors over 5 minutes.
        - name: Cortex notification error
          description: 'Cortex is failing when sending alert notifications (instance {{ $labels.instance }})'
          query: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0'
          severity: critical
        # At least one ingester ring member in state "Unhealthy".
        - name: Cortex ingester unhealthy
          description: 'Cortex has an unhealthy ingester'
          query: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'
          severity: critical
        # Per-job sum of the query-frontend queue length stays above zero.
        - name: Cortex frontend queries stuck
          description: 'There are queued up queries in query-frontend.'
          query: 'sum by (job) (cortex_query_frontend_queue_length) > 0'
          severity: critical
          for: 5m
|
|
|
|
# Grafana Alloy rules based on the metrics Alloy exposes itself.
- name: Grafana Alloy
  exporters:
    - slug: embedded-exporter
      rules:
        # `A unless B` keeps the series of A that have no match in B. With the
        # offset on the left-hand side, the result is every instance that
        # reported alloy_build_info 2 minutes ago but reports nothing now.
        - name: Grafana Alloy service down
          description: 'Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.'
          query: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)'
          severity: critical
|
|
|
|
# OpenTelemetry Collector rules based on the collector's own otelcol_* metrics.
- name: OpenTelemetry Collector
  exporters:
    - name: Embedded exporter
      slug: embedded-exporter
      doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/
      comments: |
        OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
        These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
        All collector internal metrics are prefixed with 'otelcol_'.
      rules:
        # Scrape target whose job name matches ".*otel.*collector.*" reports up == 0.
        - name: OpenTelemetry Collector down
          description: OpenTelemetry Collector instance has disappeared or is not being scraped
          query: 'up{job=~".*otel.*collector.*"} == 0'
          severity: critical
          for: 1m
        # Receiver-side refusals: non-zero 5-minute rate of refused items.
        - name: OpenTelemetry Collector receiver refused spans
          description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused metric points
          description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused log records
          description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
          severity: critical
          for: 5m
        # Exporter-side failures: non-zero 5-minute rate of failed sends.
        - name: OpenTelemetry Collector exporter failed spans
          description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed metric points
          description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed log records
          description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
          severity: warning
          for: 5m
        # The ratio only carries the instance/job/exporter labels after
        # `/ on(...)`, so the capacity guard must be matched with the same
        # `on(...)` list; a bare `and` compares the full label sets and would
        # never match if the capacity series has any additional label.
        - name: OpenTelemetry Collector exporter queue nearly full
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full"
          query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
          severity: warning
        # Processor-side refusals: non-zero 5-minute rate of refused items.
        - name: OpenTelemetry Collector processor refused spans
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector processor refused metric points
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
          severity: warning
          for: 5m
        # Ratio of heap_alloc_bytes to total_sys_memory_bytes per (instance, job).
        # NOTE(review): confirm that total_sys_memory_bytes is the intended
        # denominator for "memory usage above 90%" (it may not be a host or
        # container memory limit).
        - name: OpenTelemetry Collector high memory usage
          description: "OpenTelemetry Collector memory usage is above 90%"
          query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
          severity: warning
          for: 5m
        # No spans accepted while spans are being refused on the otlp receiver.
        - name: OpenTelemetry Collector OTLP receiver errors
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused"
          query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
          severity: critical
          for: 2m
|
|
|
|
# Jenkins rules based on the metrics exposed by the Jenkins Prometheus plugin.
- name: Jenkins
  exporters:
    - name: Metric plugin
      slug: metric-plugin
      doc_url: https://plugins.jenkins.io/prometheus/
      rules:
        # Offline-node gauge above zero for 5 minutes.
        - name: Jenkins node offline
          description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
          query: "jenkins_node_offline_value > 0"
          severity: critical
          for: 5m
        # Online-node gauge equal to zero.
        - name: Jenkins no node online
          description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
          query: "jenkins_node_online_value == 0"
          severity: critical
        # Health-check score below 1.
        - name: Jenkins healthcheck
          description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
          query: "jenkins_health_check_score < 1"
          severity: critical
        # More than 3 plugins with a pending update per instance, for a full day.
        - name: Jenkins outdated plugins
          description: "{{ $value }} plugins need update"
          query: "sum(jenkins_plugins_withUpdate) by (instance) > 3"
          severity: warning
          for: 1d
        # Build health score below 1.
        - name: Jenkins builds health score
          description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
          query: "default_jenkins_builds_health_score < 1"
          severity: critical
        # increase() rather than delta(): the metric name ends in _total, and
        # delta() is meant for gauges and does not compensate for counter resets.
        # NOTE(review): confirm jenkins_runs_failure_total is exposed as a counter.
        - name: Jenkins run failure total
          description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
          query: "increase(jenkins_runs_failure_total[1h]) > 100"
          severity: warning
        # Last build reports at least one failing test.
        - name: Jenkins build tests failing
          description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
          query: "default_jenkins_builds_last_build_tests_failing > 0"
          severity: warning
        # Result ordinal 2 is FAILURE according to the table in `comments` below.
        - name: Jenkins last build failed
          description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
          query: "default_jenkins_builds_last_build_result_ordinal == 2"
          severity: warning
          comments: |
            * RUNNING  -1 true  - The build is still in progress.
            * SUCCESS   0 true  - The build had no errors.
            * UNSTABLE  1 true  - The build had some errors but they were not fatal. For example, some tests failed.
            * FAILURE   2 false - The build had a fatal error.
            * NOT_BUILT 3 false - The module was not built.
            * ABORTED   4 false - The build was manually aborted.
|
|
|
|
# APC UPS rules based on the apcupsd_* metrics of mdlayher/apcupsd_exporter.
- name: APC UPS
  exporters:
    - name: mdlayher/apcupsd_exporter
      slug: apcupsd_exporter
      doc_url: https://github.com/mdlayher/apcupsd_exporter
      rules:
        # Battery charge below 10 percent.
        - name: APC UPS Battery nearly empty
          description: 'Battery is almost empty (< 10% left)'
          query: 'apcupsd_battery_charge_percent < 10'
          severity: critical
        # Remaining battery time below 900 seconds (15 minutes).
        - name: APC UPS Less than 15 Minutes of battery time remaining
          description: 'Battery is almost empty (< 15 Minutes remaining)'
          query: 'apcupsd_battery_time_left_seconds < 900'
          severity: critical
        # Time-on-battery is non-zero; the value is rendered as a duration.
        - name: APC UPS AC input outage
          description: 'UPS now running on battery (since {{$value | humanizeDuration}})'
          query: 'apcupsd_battery_time_on_seconds > 0'
          severity: warning
        # Measured battery voltage below 95% of the nominal voltage.
        - name: APC UPS low battery voltage
          description: 'Battery voltage is lower than nominal (< 95%)'
          query: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
          severity: warning
        # Internal temperature at or above 40 for 2 minutes.
        - name: APC UPS high temperature
          description: 'Internal temperature is high ({{$value}}°C)'
          query: 'apcupsd_internal_temperature_celsius >= 40'
          severity: warning
          for: 2m
        # Load above 80 percent.
        - name: APC UPS high load
          description: 'UPS load is > 80%'
          query: 'apcupsd_ups_load_percent > 80'
          severity: warning
|
|
|
|
- name: Graph Node
|
|
exporters:
|
|
- name: Embedded exporter
|
|
slug: embedded-exporter
|
|
rules:
|
|
# Provider status rules: each one matches a distinct eth_rpc_status value (1-4).
- name: Provider failed because net_version failed
  description: 'Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`'
  query: 'eth_rpc_status == 1'
  severity: critical
- name: Provider failed because get genesis failed
  description: 'Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`'
  query: 'eth_rpc_status == 2'
  severity: critical
- name: Provider failed because net_version timeout
  description: 'net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`'
  query: 'eth_rpc_status == 3'
  severity: critical
- name: Provider failed because get genesis timeout
  description: 'Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`'
  query: 'eth_rpc_status == 4'
  severity: critical
# Store connection wait time: the same rule at two thresholds, warning above
# 10 ms and critical above 20 ms.
- name: Store connection is too slow
  description: 'Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`'
  query: 'store_connection_wait_time_ms > 10'
  severity: warning
- name: Store connection is too slow
  description: 'Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`'
  query: 'store_connection_wait_time_ms > 20'
  severity: critical
|