mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 11:27:00 +08:00
Publish
This commit is contained in:
parent
79960ae2b4
commit
59dc6dca5c
1 changed file with 37 additions and 19 deletions
|
|
@@ -5,46 +5,64 @@ groups:
|
|||
rules:
|
||||
|
||||
- alert: SmartDeviceTemperatureWarning
|
||||
expr: 'smartctl_device_temperature > 60'
|
||||
for: 2m
|
||||
expr: 'avg_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) > 60'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Smart device temperature warning (instance {{ $labels.instance }})
|
||||
description: "Device temperature warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: SMART device temperature warning (instance {{ $labels.instance }})
|
||||
description: "Device temperature warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartDeviceTemperatureCritical
|
||||
expr: 'smartctl_device_temperature > 80'
|
||||
for: 2m
|
||||
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Smart device temperature critical (instance {{ $labels.instance }})
|
||||
description: "Device temperature critical (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: SMART device temperature critical (instance {{ $labels.instance }})
|
||||
description: "Device temperature critical (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartDeviceTemperatureWasOverTripValue
|
||||
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART device temperature was over trip value (instance {{ $labels.instance }})
|
||||
description: "Device temperature over trip value (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartStatus
|
||||
expr: 'smartctl_device_smart_status != 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART status (instance {{ $labels.instance }})
|
||||
description: "Device has a SMART status failure (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartCriticalWarning
|
||||
expr: 'smartctl_device_critical_warning > 0'
|
||||
for: 15m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Smart critical warning (instance {{ $labels.instance }})
|
||||
description: "device has critical warning (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: SMART critical warning (instance {{ $labels.instance }})
|
||||
description: "Disk controller has critical warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartMediaErrors
|
||||
expr: 'smartctl_device_media_errors > 0'
|
||||
for: 15m
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Smart media errors (instance {{ $labels.instance }})
|
||||
description: "device has media errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: SMART media errors (instance {{ $labels.instance }})
|
||||
description: "Disk controller detected media errors (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SmartNvmeWearoutIndicator
|
||||
expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
|
||||
for: 15m
|
||||
- alert: SmartWearoutIndicator
|
||||
expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
|
||||
description: "NVMe device is wearing out (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
|
||||
description: "Device is wearing out (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
Loading…
Reference in a new issue