Refined some more queries

2026-06-26 19:37:27 +08:00 · 2024-03-06 11:13:48 -05:00 · 2024-03-06 11:13:48 -05:00 · 224e6d00a9
commit 224e6d00a9
parent c026db7e52
2 changed files with 12 additions and 11 deletions
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@@ -9,8 +9,6 @@ on:
 jobs:
  publish:
    name: Publish
-    # Check if the PR is not from a fork or manually executed
-    if: ${{ (github.repository_owner == 'samber') || (github.event_name == 'workflow_dispatch') }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -126,7 +126,8 @@ groups:
                severity: critical
              - name: Prometheus timeseries cardinality
                description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
-                query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+                # Node CPU seconds total and Node SystemD Unit State are always high cardinality due to systemd containing services and CPU containing cores
+                query: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu_seconds_total|node_systemd_unit_state"})) > 10000'
                severity: warning

      - name: Host and hardware
@@ -147,9 +148,8 @@ groups:
              - name: Host Memory is underutilized
                description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
                # We use MemFree, many buffers (ZFS, databases etc) are declared as available memory, but would perform poorly if reduced
-                query: '((avg_over_time(node_memory_MemFree_bytes[30m]) / node_memory_MemTotal_bytes) > .80)'
+                query: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
                severity: info
-                for: 1w
                comments: |
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
              - name: Host unusual network throughput in
@@ -214,16 +214,16 @@ groups:
                for: 2m
              - name: Host high CPU load
                description: CPU load is > 80%
-                query: '((avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80)'
+                query: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
                severity: warning
                for: 10m
              - name: Host CPU is underutilized
                description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
-                query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > .80'
+                query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
                severity: info
-                for: 1w
                comments: |
                  You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
+                for: 1w
              - name: Host CPU steal noisy neighbor
                description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
                query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
@@ -255,12 +255,15 @@ groups:
                severity: warning
              - name: Host physical component too hot
                description: "Physical hardware component too hot"
-                query: 'node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75'
+                # Some components have different max temperatures (eg. 65 for hard drive sensors, 90-100 for CPU).
+                # This is defined for all sensors, the crit value may not be defined for everything.
+                query: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
                severity: warning
                for: 5m
              - name: Host node overtemperature alarm
                description: "Physical node temperature alarm triggered"
-                query: '(node_hwmon_temp_crit_alarm_celsius == 1)'
+                # This is a critical alarm, some things (eg. NVMe) have just the temp alarm.
+                query: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
                severity: critical
              - name: Host Software RAID insufficient drives
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
@@ -672,7 +675,7 @@ groups:
                severity: warning
              - name: Postgresql commit rate low
                description: Postgresql seems to be processing very few transactions
-                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[1m]) < 5'
+                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                severity: critical
                for: 5m
              - name: Postgresql low XID consumption