From 981e82d6490652943f87accb1525c0940adade7d Mon Sep 17 00:00:00 2001 From: Rob Brown Date: Thu, 30 Apr 2020 13:27:30 +0100 Subject: [PATCH 1/3] Add HostEDACUncorrectableErrorsdetected and HostEDACCorrectableErrorsdetected rules --- _data/rules.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index d9003b9..2d715bb 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -207,6 +207,14 @@ groups: description: OOM kill detected query: 'increase(node_vmstat_oom_kill[30m]) > 1' severity: warning + - name: Host EDAC Correctable Errors detected + description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' + query: 'increase(node_edac_correctable_errors_total[5m]) > 0' + severity: warning + - name: Host EDAC Uncorrectable Errors detected + description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' + query: 'increase(node_edac_uncorrectable_errors_total[5m])' + severity: warning - name: Docker containers exporters: From 4b22c078ea83c1ffc4bb91e3f8790b33bbb7df9e Mon Sep 17 00:00:00 2001 From: Rob Brown Date: Mon, 4 May 2020 18:47:20 +0100 Subject: [PATCH 2/3] Align EDAC errors with comments --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 2d715bb..40262f5 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -210,10 +210,10 @@ groups: - name: Host EDAC Correctable Errors detected description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.' query: 'increase(node_edac_correctable_errors_total[5m]) > 0' - severity: warning + severity: info - name: Host EDAC Uncorrectable Errors detected description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
- query: 'increase(node_edac_uncorrectable_errors_total[5m])' + query: 'node_edac_uncorrectable_errors_total > 1' severity: warning - name: Docker containers From 8912db93bc87772c5aba3d06ca5eb31959469d3f Mon Sep 17 00:00:00 2001 From: Rob Brown Date: Mon, 4 May 2020 19:04:52 +0100 Subject: [PATCH 3/3] Fix "greater than" value --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 40262f5..6da8c7b 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -213,7 +213,7 @@ groups: severity: info - name: Host EDAC Uncorrectable Errors detected description: '{{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.' - query: 'node_edac_uncorrectable_errors_total > 1' + query: 'node_edac_uncorrectable_errors_total >= 1' severity: warning - name: Docker containers