From 59dc6dca5c9bb366033090518ed2831b8acd422c Mon Sep 17 00:00:00 2001 From: samber Date: Sat, 24 Feb 2024 19:15:25 +0000 Subject: [PATCH] Publish --- .../smartctl-exporter.yml | 56 ++++++++++++------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml index 1946c38..4334bf0 100644 --- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml +++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml @@ -5,46 +5,64 @@ groups: rules: - alert: SmartDeviceTemperatureWarning - expr: 'smartctl_device_temperature > 60' - for: 2m + expr: 'avg_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) > 60' + for: 0m labels: severity: warning annotations: - summary: Smart device temperature warning (instance {{ $labels.instance }}) - description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART device temperature warning (instance {{ $labels.instance }}) + description: "Device temperature warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartDeviceTemperatureCritical - expr: 'smartctl_device_temperature > 80' - for: 2m + expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70' + for: 0m labels: severity: critical annotations: - summary: Smart device temperature critical (instance {{ $labels.instance }}) - description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART device temperature critical (instance {{ $labels.instance }}) + description: "Device temperature critical (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartDeviceTemperatureWasOverTripValue + expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}' + for: 0m + labels: + severity: critical + annotations: + summary: SMART device temperature was over trip value (instance {{ $labels.instance }}) + description: "Device temperature over trip value (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: SmartStatus + expr: 'smartctl_device_smart_status != 1' + for: 0m + labels: + severity: critical + annotations: + summary: SMART status (instance {{ $labels.instance }}) + description: "Device has a SMART status failure (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartCriticalWarning expr: 'smartctl_device_critical_warning > 0' - for: 15m + for: 0m labels: severity: critical annotations: - summary: Smart critical warning (instance {{ $labels.instance }}) - description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART critical warning (instance {{ $labels.instance }}) + description: "Disk controller has critical warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: SmartMediaErrors expr: 'smartctl_device_media_errors > 0' - for: 15m + for: 0m labels: severity: critical annotations: - summary: Smart media errors (instance {{ $labels.instance }}) - description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART media errors (instance {{ $labels.instance }}) + description: "Disk controller detected media errors (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: SmartNvmeWearoutIndicator - expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}' - for: 15m + - alert: SmartWearoutIndicator + expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold' + for: 0m labels: severity: critical annotations: - summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }}) - description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART Wearout Indicator (instance {{ $labels.instance }}) + description: "Device is wearing out (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"