mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
Publish
This commit is contained in:
parent
224e6d00a9
commit
7e0d0097ce
3 changed files with 8 additions and 8 deletions
12
dist/rules/host-and-hardware/node-exporter.yml
vendored
12
dist/rules/host-and-hardware/node-exporter.yml
vendored
|
|
@ -23,8 +23,8 @@ groups:
|
||||||
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostMemoryIsUnderutilized
|
- alert: HostMemoryIsUnderutilized
|
||||||
expr: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80)'
|
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
|
||||||
for: 1w
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
|
|
@ -122,7 +122,7 @@ groups:
|
||||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostHighCpuLoad
|
- alert: HostHighCpuLoad
|
||||||
expr: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80)'
|
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -131,7 +131,7 @@ groups:
|
||||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuIsUnderutilized
|
- alert: HostCpuIsUnderutilized
|
||||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80'
|
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||||
for: 1w
|
for: 1w
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
@ -194,7 +194,7 @@ groups:
|
||||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostPhysicalComponentTooHot
|
- alert: HostPhysicalComponentTooHot
|
||||||
expr: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75'
|
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -203,7 +203,7 @@ groups:
|
||||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostNodeOvertemperatureAlarm
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1)'
|
expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
2
dist/rules/postgresql/postgres-exporter.yml
vendored
2
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -86,7 +86,7 @@ groups:
|
||||||
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlCommitRateLow
|
- alert: PostgresqlCommitRateLow
|
||||||
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 5'
|
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
|
|
@ -248,7 +248,7 @@ groups:
|
||||||
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PrometheusTimeseriesCardinality
|
- alert: PrometheusTimeseriesCardinality
|
||||||
expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
|
expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu_seconds_total|node_systemd_unit_state"})) > 10000'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue