diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 79506c5..0ce5002 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -23,8 +23,8 @@ groups: description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostMemoryIsUnderutilized - expr: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80)' - for: 1w + expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' + for: 0m labels: severity: info annotations: @@ -122,7 +122,7 @@ groups: description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad - expr: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80)' + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80' for: 10m labels: severity: warning @@ -131,7 +131,7 @@ groups: description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuIsUnderutilized - expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80' + expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' for: 1w labels: severity: info @@ -194,7 +194,7 @@ groups: description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostPhysicalComponentTooHot - expr: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75' + expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius' for: 5m labels: severity: warning @@ -203,7 +203,7 @@ groups: description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm - expr: '(node_hwmon_temp_crit_alarm_celsius == 1)' + expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1' for: 0m labels: severity: critical diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index c9a93b1..1e96a42 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -86,7 +86,7 @@ groups: description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlCommitRateLow - expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 5' + expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5' for: 5m labels: severity: critical diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index 65bfd82..56117b0 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -248,7 +248,7 @@ groups: description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusTimeseriesCardinality - expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' + expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu_seconds_total|node_systemd_unit_state"})) > 10000' for: 0m labels: severity: warning