# Prometheus alerting rules built on smartctl_* device metrics.
# Every rule uses `for: 0m`, i.e. no pending period: an alert fires on the
# first rule evaluation in which its expression returns a result.
groups:
  - name: SmartctlExporter
    rules:
      # 10-minute average of the "current" temperature series is above 60.
      # NOTE(review): unit is presumably degrees Celsius - confirm against the exporter.
      - alert: SmartDeviceTemperatureWarning
        expr: 'avg_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) > 60'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'SMART device temperature warning (instance {{ $labels.instance }})'
          description: "Device temperature warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 10-minute peak of the "current" temperature series reached 70 or more.
      - alert: SmartDeviceTemperatureCritical
        expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART device temperature critical (instance {{ $labels.instance }})'
          description: "Device temperature critical (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 10-minute peak of the "current" temperature series reached the same
      # drive's "drive_trip" series; the two sides are matched on
      # (device, instance) only, so the differing temperature_type label
      # does not prevent the comparison.
      - alert: SmartDeviceTemperatureWasOverTripValue
        expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART device temperature was over trip value (instance {{ $labels.instance }})'
          description: "Device temperature over trip value (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # SMART status metric is anything other than 1.
      # NOTE(review): presumably 1 means "passed" - confirm against the exporter.
      - alert: SmartStatus
        expr: 'smartctl_device_smart_status != 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART status (instance {{ $labels.instance }})'
          description: "Device has a SMART status failure (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Critical-warning metric is greater than zero.
      - alert: SmartCriticalWarning
        expr: 'smartctl_device_critical_warning > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART critical warning (instance {{ $labels.instance }})'
          description: "Disk controller has critical warning (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Media-error metric is greater than zero.
      - alert: SmartMediaErrors
        expr: 'smartctl_device_media_errors > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART media errors (instance {{ $labels.instance }})'
          description: "Disk controller detected media errors (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Available-spare metric has dropped below its threshold metric; the two
      # series are matched on their full label sets (no on()/ignoring()).
      - alert: SmartWearoutIndicator
        expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART Wearout Indicator (instance {{ $labels.instance }})'
          description: "Device is wearing out (instance {{ $labels.instance }}, drive {{ $labels.device }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"