This commit is contained in:
samber 2024-03-06 16:15:04 +00:00
parent 224e6d00a9
commit 7e0d0097ce
3 changed files with 8 additions and 8 deletions

View file

@ -23,8 +23,8 @@ groups:
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized - alert: HostMemoryIsUnderutilized
expr: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80)' expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 1w for: 0m
labels: labels:
severity: info severity: info
annotations: annotations:
@ -122,7 +122,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad - alert: HostHighCpuLoad
expr: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80)' expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -131,7 +131,7 @@ groups:
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized - alert: HostCpuIsUnderutilized
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80' expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w for: 1w
labels: labels:
severity: info severity: info
@ -194,7 +194,7 @@ groups:
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75' expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -203,7 +203,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1)' expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical

View file

@ -86,7 +86,7 @@ groups:
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow - alert: PostgresqlCommitRateLow
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 5' expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 5m for: 5m
labels: labels:
severity: critical severity: critical

View file

@ -248,7 +248,7 @@ groups:
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTimeseriesCardinality - alert: PrometheusTimeseriesCardinality
expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu_seconds_total|node_systemd_unit_state"})) > 10000'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning