From 46043360de5b8526c489c79bccdec6d204833ef0 Mon Sep 17 00:00:00 2001 From: Evi Vanoost Date: Sun, 25 Feb 2024 14:53:30 -0500 Subject: [PATCH] Removed queries that throw errors when systems are upgraded. Also fixed and simplified a few Postgres queries. --- _data/rules.yml | 102 +++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 45 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index afd85dd..269dde5 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -137,37 +137,37 @@ groups: rules: - name: Host out of memory description: Node memory is filling up (< 10% left) - query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' severity: warning for: 2m - name: Host memory under memory pressure description: The node is under heavy memory pressure. High rate of loading memory pages from disk. - query: '(rate(node_vmstat_pgmajfault[5m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' severity: warning - name: Host Memory is underutilized description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})" # We use MemFree, many buffers (ZFS, databases etc) are declared as available memory, but would perform poorly if reduced - query: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80)' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host unusual network throughput in description: "Host receive bandwidth is high (>80%)" - query: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' severity: warning - name: Host unusual network throughput out description: "Host transmit bandwidth is high (>80%)" - query: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' severity: warning - name: Host unusual disk read rate description: "Disk is too busy (IO wait > 80%)" - query: '(rate(node_disk_io_time_seconds_total[5m]) > .80) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_disk_io_time_seconds_total[5m]) > .80)' severity: warning - name: Host out of disk space description: "Disk is almost full (< 10% left)" # Network filesystems have quotas etc. 
and should not be included in this alert - query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' severity: critical comments: | Please add ignored mountpoints in node_exporter parameters like @@ -185,7 +185,7 @@ groups: for: 2m - name: Host out of inodes description: Disk is almost running out of available inodes (< 10% left) - query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' severity: critical for: 2m - name: Host filesystem device error @@ -204,41 +204,41 @@ groups: for: 2m - name: Host unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' severity: warning for: 2m - name: Host unusual disk write latency description: Disk latency is growing (write operations > 100ms) - query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) 
* on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)' severity: warning for: 2m - name: Host high CPU load description: CPU load is > 80% - query: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80)' severity: warning for: 10m - name: Host CPU is underutilized description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs." - query: '((avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80' severity: info for: 1w comments: | You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - name: Host CPU steal noisy neighbor description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. - query: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' severity: warning - name: Host CPU high iowait description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond. - query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' severity: warning - name: Host unusual disk IO description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities." 
- query: '(rate(node_disk_io_time_seconds_total[5m]) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' severity: warning - name: Host context switching description: Context switching is growing on the node (> 10000 / CPU / s) - query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000' severity: warning comments: | 10000 context switches is an arbitrary number. @@ -246,81 +246,82 @@ groups: Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - name: Host swap is filling up description: Swap is filling up (>80%) - query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)' severity: warning for: 2m - name: Host systemd service crashed description: "systemd service crashed" - query: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_systemd_unit_state{state="failed"} == 1)' severity: warning - name: Host physical component too hot description: "Physical hardware component too hot" - query: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75' severity: warning for: 5m - name: Host node overtemperature alarm description: "Physical node temperature alarm 
triggered" - query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_hwmon_temp_crit_alarm_celsius == 1)' severity: critical - name: Host Software RAID insufficient drives description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining." - query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)' severity: critical - name: Host Software RAID disk failure description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention." - query: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_md_disks{state="failed"} > 0)' severity: warning for: 2m - name: Host kernel version deviations description: Kernel version for {{ $labels.instance }} has changed query: 'changes(node_uname_info[1h]) > 0' + comments: | + This alert may happen when the host is rebooted after a software update. severity: info - for: 6h - name: Host OOM kill detected description: OOM kill detected - query: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' - severity: warning + query: '(increase(node_vmstat_oom_kill[1m]) > 0)' + severity: critical - name: Host EDAC Correctable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' 
- query: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(increase(node_edac_correctable_errors_total[1m]) > 0)' severity: info - name: Host EDAC Uncorrectable Errors detected description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' - query: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_edac_uncorrectable_errors_total > 0)' severity: warning - name: Host Network Receive Errors description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' - query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)' severity: warning for: 2m - name: Host Network Transmit Errors description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)' severity: warning for: 2m - name: Host Network Bond Degraded description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".' 
- query: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((node_bonding_active - node_bonding_slaves) != 0)' severity: warning for: 2m - name: Host conntrack limit description: "The number of conntrack is approaching limit" - query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)' severity: warning for: 5m - name: Host clock skew description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host." - query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))' severity: warning for: 10m - name: Host clock not synchronising description: "Clock not synchronising. Ensure NTP is configured on this host." - query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)' severity: warning for: 2m - name: Host requires reboot description: "{{ $labels.instance }} requires a reboot." 
- query: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '(node_reboot_required > 0)' severity: info for: 4h @@ -331,18 +332,22 @@ groups: doc_url: https://github.com/prometheus-community/smartctl_exporter rules: - name: SMART device temperature warning - description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }}) - query: avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) > 60 + description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C + query: (avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60 severity: warning - name: SMART device temperature critical - description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }}) - query: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70 + description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C + query: (max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70 severity: critical # Datacenter drives have a trip temperature - - name: SMART device temperature was over trip value + - name: SMART device temperature over trip value description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}) query: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"} severity: critical + - name: SMART device temperature nearing trip value + description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }} + query: 
max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80) + severity: warning - name: SMART status description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}) query: smartctl_device_smart_status != 1 @@ -355,6 +360,8 @@ groups: description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}) query: smartctl_device_media_errors > 0 severity: critical + comments: | + Media errors are a sign of a failing disk. Replace the disk as soon as possible. - name: SMART Wearout Indicator description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}) # The threshold is not present on devices that do not support it @@ -646,13 +653,15 @@ groups: - name: Postgresql too many connections description: PostgreSQL instance has too many connections (> 80%). query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)" - severity: warning + severity: critical for: 2m - name: Postgresql not enough connections description: PostgreSQL instance should have more connections (> 5) query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' - severity: warning + severity: critical for: 2m + comments: | + If the number of connections is too low, it may indicate that the application has died. 
- name: Postgresql dead locks description: PostgreSQL has dead-locks query: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' @@ -663,9 +672,9 @@ groups: severity: warning - name: Postgresql commit rate low description: Postgresql seems to be processing very few transactions - query: "rate(pg_stat_database_xact_commit[1m]) < 10" + query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 5' severity: critical - for: 2m + for: 5m - name: Postgresql low XID consumption description: Postgresql seems to be consuming transaction IDs very slowly query: "rate(pg_txid_current[1m]) < 5" @@ -691,12 +700,15 @@ groups: for: 2m - name: Postgresql configuration changed description: Postgres Database configuration change has occurred - query: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + query: 'changes(label_replace({__name__=~"pg_settings_.*"},"name","$1","__name__", "(.+)")[1h:]) > 0' severity: info - name: Postgresql SSL compression active - description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. + description: Database allows connections with SSL compression enabled. query: "sum(pg_stat_ssl_compression) > 0" severity: critical + comments: | + TLS compression is a security risk and should be disabled. It has been removed in TLSv1.3. + https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNECT-SSLCOMPRESSION - name: Postgresql too many locks acquired description: Too many locks acquired on the database. 
If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. query: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"