mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Publish
This commit is contained in:
parent
fb857e8b39
commit
20f9a36615
4 changed files with 23 additions and 23 deletions
|
|
@ -1,6 +1,6 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
- name:
|
||||
|
||||
rules:
|
||||
|
||||
|
|
@ -10,8 +10,8 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Caddy reverse proxy down (instance {{ $labels.instance }})
|
||||
description: "All Caddy reverse proxies are down\n LABELS = {{ $labels }}"
|
||||
summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
|
||||
description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CaddyHighHttp4xxErrorRateService
|
||||
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
|
||||
26
dist/rules/host-and-hardware/node-exporter.yml
vendored
26
dist/rules/host-and-hardware/node-exporter.yml
vendored
|
|
@ -38,7 +38,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host receive bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
|
||||
|
|
@ -74,7 +74,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem will likely run out of space within the next 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
||||
|
|
@ -94,17 +94,17 @@ groups:
|
|||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
- alert: HostInodesMayFillIn24Hours
|
||||
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -164,10 +164,10 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitchingHigh
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -203,7 +203,7 @@ groups:
|
|||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -217,7 +217,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host Software RAID insufficient drives (instance {{ $labels.instance }})
|
||||
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSoftwareRaidDiskFailure
|
||||
|
|
@ -226,7 +226,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Software RAID disk failure (instance {{ $labels.instance }})
|
||||
summary: Host software RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
|
|
@ -236,13 +236,13 @@ groups:
|
|||
severity: info
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Kernel version for {{ $labels.instance }} has changed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
10
dist/rules/postgresql/postgres-exporter.yml
vendored
10
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -32,7 +32,7 @@ groups:
|
|||
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 864000'
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -41,7 +41,7 @@ groups:
|
|||
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 864000'
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -53,7 +53,7 @@ groups:
|
|||
expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many connections (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -87,7 +87,7 @@ groups:
|
|||
|
||||
- alert: PostgresqlCommitRateLow
|
||||
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||
for: 5m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -155,7 +155,7 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
|
||||
description: "Database allows connections with SSL compression enabled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
|
|
|
|||
|
|
@ -32,7 +32,7 @@ groups:
|
|||
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetMissingWithWarmupTime
|
||||
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -248,7 +248,7 @@ groups:
|
|||
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTimeseriesCardinality
|
||||
expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu.*|node_systemd_unit_state"})) > 10000'
|
||||
expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
Loading…
Reference in a new issue