awesome-prometheus-alerts/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
guruevi 70ac7d9cae
Various updates and quality of life changes (#405)
* smartctl_exporter publishes both drive_trip and current drive temperatures. Since most of the alerts are going to be permanent, it does not make sense to wait for the alert to be on for a certain time. Temperature sensors likewise vary, using the last sample is not sufficient to alert on potential issues.

* Add an option to run GitHub Action manually

* Add an option to force running the action for testing purposes

* Set variables correctly

* Set variables correctly

* Publish

* Clean up some more metrics

* Publish

* Minor bug fixes

* Publish

* Removed queries that throw errors when systems are upgraded. Also fixed and simplified a few Postgres queries.

* Publish

* Refined some more queries

* Publish

* PostgreSQL now has optimized autovacuum behavior

* Publish

* PostgreSQL now has optimized autovacuum behavior

* Publish

* Publish

* Query fails if instance names are not unique across jobs. This fixes it.

* Publish

* Ruby is out of date

---------

Co-authored-by: samber <samber@users.noreply.github.com>
2025-01-28 06:06:47 +01:00

77 lines
3.8 KiB
YAML

groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
for: 0m
labels:
severity: warning
annotations:
summary: SMART device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
for: 0m
labels:
severity: critical
annotations:
summary: SMART device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureOverTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
for: 0m
labels:
severity: critical
annotations:
summary: SMART device temperature over trip value (instance {{ $labels.instance }})
description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureNearingTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
for: 0m
labels:
severity: warning
annotations:
summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartStatus
expr: 'smartctl_device_smart_status != 1'
for: 0m
labels:
severity: critical
annotations:
summary: SMART status (instance {{ $labels.instance }})
description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 0m
labels:
severity: critical
annotations:
summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 0m
labels:
severity: critical
annotations:
summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator
expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
for: 0m
labels:
severity: critical
annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"