Publish

2026-06-21 00:47:18 +08:00 · 2025-02-16 22:17:02 +00:00 · 2025-02-16 22:17:02 +00:00 · 20f9a36615
commit 20f9a36615
parent fb857e8b39
4 changed files with 23 additions and 23 deletions
--- a/dist/rules/caddy/embedded-exporter.yml
+++ b/dist/rules/caddy/embedded-exporter.yml
@ -1,6 +1,6 @@
 groups:

- name: EmbeddedExporter
+- name: EmbeddedExporter

  rules:

@ -10,8 +10,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Caddy reverse proxy down (instance {{ $labels.instance }})
-        description: "All Caddy reverse proxies are down\n LABELS = {{ $labels }}"
+        summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
+        description: "All Caddy reverse proxies are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: CaddyHighHttp4xxErrorRateService
      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@ -38,7 +38,7 @@ groups:
        severity: warning
      annotations:
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
-        description: "Host receive bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualNetworkThroughputOut
      expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
@ -74,7 +74,7 @@ groups:
        severity: warning
      annotations:
        summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem will likely run out of space within the next 24 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostOutOfInodes
      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
@ -94,17 +94,17 @@ groups:
        summary: Host filesystem device error (instance {{ $labels.instance }})
        description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

-    - alert: HostInodesWillFillIn24Hours
+    - alert: HostInodesMayFillIn24Hours
      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
+        description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostUnusualDiskReadLatency
-      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: warning
@ -164,10 +164,10 @@ groups:
        severity: warning
      annotations:
        summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostContextSwitchingHigh
-      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
      for: 0m
      labels:
        severity: warning
@ -203,7 +203,7 @@ groups:
        description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostNodeOvertemperatureAlarm
-      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
      for: 0m
      labels:
        severity: critical
@ -217,7 +217,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Host Software RAID insufficient drives (instance {{ $labels.instance }})
+        summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostSoftwareRaidDiskFailure
@ -226,7 +226,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Host Software RAID disk failure (instance {{ $labels.instance }})
+        summary: Host software RAID disk failure (instance {{ $labels.instance }})
        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostKernelVersionDeviations
@ -236,13 +236,13 @@ groups:
        severity: info
      annotations:
        summary: Host kernel version deviations (instance {{ $labels.instance }})
-        description: "Kernel version for {{ $labels.instance }} has changed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: HostOomKillDetected
      expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
      for: 0m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Host OOM kill detected (instance {{ $labels.instance }})
        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@ -32,7 +32,7 @@ groups:
        description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTableNotAutoVacuumed
-      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 864000'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
      for: 0m
      labels:
        severity: warning
@ -41,7 +41,7 @@ groups:
        description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTableNotAutoAnalyzed
-      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 864000'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
      for: 0m
      labels:
        severity: warning
@ -53,7 +53,7 @@ groups:
      expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
      for: 2m
      labels:
-        severity: critical
+        severity: warning
      annotations:
        summary: Postgresql too many connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@ -87,7 +87,7 @@ groups:

    - alert: PostgresqlCommitRateLow
      expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
-      for: 5m
+      for: 2m
      labels:
        severity: critical
      annotations:
@ -155,7 +155,7 @@ groups:
        severity: critical
      annotations:
        summary: Postgresql SSL compression active (instance {{ $labels.instance }})
-        description: "Database allows connections with SSL compression enabled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PostgresqlTooManyLocksAcquired
      expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@ -32,7 +32,7 @@ groups:
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTargetMissingWithWarmupTime
-      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))'
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
      for: 0m
      labels:
        severity: critical
@ -248,7 +248,7 @@ groups:
        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

    - alert: PrometheusTimeseriesCardinality
-      expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu.*|node_systemd_unit_state"})) > 10000'
+      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
      for: 0m
      labels:
        severity: warning