From a4dbefd85313934277d0d7aeedf3767fedff1785 Mon Sep 17 00:00:00 2001 From: samber Date: Thu, 22 Jun 2023 16:30:42 +0000 Subject: [PATCH] Publish --- dist/rules/host-and-hardware/node-exporter.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index c9bb84f..ce5176a 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -22,14 +22,14 @@ groups: summary: Host memory under memory pressure (instance {{ $labels.instance }}) description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostMemoryIsUnderUtilized + - alert: HostMemoryIsUnderutilized expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 1w labels: severity: info annotations: - summary: Host Memory is under utilized (instance {{ $labels.instance }}) - description: "Node memory is < 20% for 1 week. Consider reducing memory space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputIn expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' @@ -123,20 +123,20 @@ groups: - alert: HostHighCpuLoad expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' - for: 0m + for: 10m labels: severity: warning annotations: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: HostCpuIsUnderUtilized + - alert: HostCpuIsUnderutilized expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 1w labels: severity: info annotations: - summary: Host CPU is under utilized (instance {{ $labels.instance }}) + summary: Host CPU is underutilized (instance {{ $labels.instance }}) description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostCpuStealNoisyNeighbor