diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/null.yml similarity index 87% rename from dist/rules/caddy/embedded-exporter.yml rename to dist/rules/caddy/null.yml index 4f23002..64b0230 100644 --- a/dist/rules/caddy/embedded-exporter.yml +++ b/dist/rules/caddy/null.yml @@ -1,6 +1,6 @@ groups: -- name: EmbeddedExporter +- name: rules: @@ -10,8 +10,8 @@ groups: labels: severity: critical annotations: - summary: Caddy reverse proxy down (instance {{ $labels.instance }}) - description: "All Caddy reverse proxies are down\n LABELS = {{ $labels }}" + summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }}) + description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: CaddyHighHttp4xxErrorRateService expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5' diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index d52b34d..f168bcf 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -38,7 +38,7 @@ groups: severity: warning annotations: summary: Host unusual network throughput in (instance {{ $labels.instance }}) - description: "Host receive bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' @@ -74,7 +74,7 @@ groups: severity: warning annotations: summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) - description: "Filesystem will likely run out of space within the next 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' @@ -94,17 +94,17 @@ groups: summary: Host filesystem device error (instance {{ $labels.instance }}) description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostInodesWillFillIn24Hours + - alert: HostInodesMayFillIn24Hours expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' for: 2m labels: severity: warning annotations: - summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) - description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }}) + description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadLatency - expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0' for: 2m labels: severity: warning @@ -164,10 +164,10 @@ groups: severity: warning annotations: summary: Host unusual disk IO (instance {{ $labels.instance }}) - description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostContextSwitchingHigh - expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' for: 0m labels: severity: warning @@ -203,7 +203,7 @@ groups: description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm - expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))' for: 0m labels: severity: critical @@ -217,7 +217,7 @@ groups: labels: severity: critical annotations: - summary: Host Software RAID insufficient drives (instance {{ $labels.instance }}) + summary: Host software RAID insufficient drives (instance {{ $labels.instance }}) description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSoftwareRaidDiskFailure @@ -226,7 +226,7 @@ groups: labels: severity: warning annotations: - summary: Host Software RAID disk failure (instance {{ $labels.instance }}) + summary: Host software RAID disk failure (instance {{ $labels.instance }}) description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostKernelVersionDeviations @@ -236,13 +236,13 @@ groups: severity: info annotations: summary: Host kernel version deviations (instance {{ $labels.instance }}) - description: "Kernel version for {{ $labels.instance }} has changed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOomKillDetected expr: '(increase(node_vmstat_oom_kill[1m]) > 0)' for: 0m labels: - severity: critical + severity: warning annotations: summary: Host OOM kill detected (instance {{ $labels.instance }}) description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 692acef..42e5bb8 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -32,7 +32,7 @@ groups: description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTableNotAutoVacuumed - expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 864000' + expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10' for: 0m labels: severity: warning @@ -41,7 +41,7 @@ groups: description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTableNotAutoAnalyzed - expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 864000' + expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10' for: 0m labels: severity: warning @@ -53,7 +53,7 @@ groups: expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)' for: 2m labels: - severity: critical + severity: warning annotations: summary: Postgresql too many connections (instance {{ $labels.instance }}) description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -87,7 +87,7 @@ groups: - alert: PostgresqlCommitRateLow expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' - for: 5m + for: 2m labels: severity: critical annotations: @@ -155,7 +155,7 @@ groups: severity: critical annotations: summary: Postgresql SSL compression active (instance {{ $labels.instance }}) - description: "Database allows connections with SSL compression enabled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlTooManyLocksAcquired expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index 8a2e402..908f001 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -32,7 +32,7 @@ groups: description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTargetMissingWithWarmupTime - expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))' + expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))' for: 0m labels: severity: critical @@ -248,7 +248,7 @@ groups: description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTimeseriesCardinality - expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu.*|node_systemd_unit_state"})) > 10000' + expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' for: 0m labels: severity: warning