This commit is contained in:
samber 2024-02-24 19:15:25 +00:00
parent 79960ae2b4
commit 59dc6dca5c

View file

@ -5,46 +5,64 @@ groups:
rules: rules:
- alert: SmartDeviceTemperatureWarning - alert: SmartDeviceTemperatureWarning
expr: 'smartctl_device_temperature > 60' expr: 'avg_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) > 60'
for: 2m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Smart device temperature warning (instance {{ $labels.instance }}) summary: SMART device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Device temperature warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical - alert: SmartDeviceTemperatureCritical
expr: 'smartctl_device_temperature > 80' expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70'
for: 2m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Smart device temperature critical (instance {{ $labels.instance }}) summary: SMART device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Device temperature critical (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureWasOverTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
for: 0m
labels:
severity: critical
annotations:
summary: SMART device temperature was over trip value (instance {{ $labels.instance }})
description: "Device temperature over trip value (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartStatus
expr: 'smartctl_device_smart_status != 1'
for: 0m
labels:
severity: critical
annotations:
summary: SMART status (instance {{ $labels.instance }})
description: "Device has a SMART status failure (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning - alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0' expr: 'smartctl_device_critical_warning > 0'
for: 15m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Smart critical warning (instance {{ $labels.instance }}) summary: SMART critical warning (instance {{ $labels.instance }})
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller has critical warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors - alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0' expr: 'smartctl_device_media_errors > 0'
for: 15m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Smart media errors (instance {{ $labels.instance }}) summary: SMART media errors (instance {{ $labels.instance }})
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk controller detected media errors (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartNvmeWearoutIndicator - alert: SmartWearoutIndicator
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}' expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
for: 15m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }}) summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Device is wearing out (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"