From 8d3ae0436be52599296efb67f082206cab413872 Mon Sep 17 00:00:00 2001
From: Evi Vanoost
Date: Sat, 24 Feb 2024 13:49:42 -0500
Subject: [PATCH] smartctl_exporter publishes both drive_trip and current drive
 temperatures.

Since most of the alerts are going to be permanent, it does not make sense to
wait for the alert to be on for a certain time. Temperature sensors likewise
vary, using the last sample is not sufficient to alert on potential issues.
---
NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. The Subject/body split, the position of
blank context lines, and the indentation of the hunk lines below are
reconstructed (they agree with the hunk headers and the diffstat, but are
otherwise unverified). Confirm against the target files before applying.

 CONTRIBUTING.md |  4 ++--
 _data/rules.yml | 41 +++++++++++++++++++++++------------------
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1fcb24b..02b8c38 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,8 +32,8 @@ Or with Docker:
 docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
 ```
 
-Or with Docker-Compose:
+Or with Docker Compose:
 
 ```
-docker-compose up -d
+docker compose up -d
 ```
diff --git a/_data/rules.yml b/_data/rules.yml
index 42ff8f0..9abb285 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -338,31 +338,36 @@ groups:
             slug: smartctl-exporter
             doc_url: https://github.com/prometheus-community/smartctl_exporter
             rules:
-              - name: Smart device temperature warning
-                description: Device temperature warning (instance {{ $labels.instance }})
-                query: smartctl_device_temperature > 60
+              - name: SMART device temperature warning
+                description: Device temperature warning (instance {{ $labels.instance }}, drive {{ $labels.device }})
+                query: avg_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) > 60
                 severity: warning
-                for: 2m
-              - name: Smart device temperature critical
-                description: Device temperature critical (instance {{ $labels.instance }})
-                query: smartctl_device_temperature > 80
+              - name: SMART device temperature critical
+                description: Device temperature critical (instance {{ $labels.instance }}, drive {{ $labels.device }})
+                query: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= 70
                 severity: critical
-                for: 2m
-              - name: Smart critical warning
-                description: device has critical warning (instance {{ $labels.instance }})
+              # Datacenter drives have a trip temperature
+              - name: SMART device temperature was over trip value
+                description: Device temperature over trip value (instance {{ $labels.instance }}, drive {{ $labels.device }})
+                query: max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}
+                severity: critical
+              - name: SMART status
+                description: Device has a SMART status failure (instance {{ $labels.instance }}, drive {{ $labels.device }})
+                query: smartctl_device_smart_status != 1
+                severity: critical
+              - name: SMART critical warning
+                description: Disk controller has critical warning (instance {{ $labels.instance }}, drive {{ $labels.device }})
                 query: smartctl_device_critical_warning > 0
                 severity: critical
-                for: 15m
-              - name: Smart media errors
-                description: device has media errors (instance {{ $labels.instance }})
+              - name: SMART media errors
+                description: Disk controller detected media errors (instance {{ $labels.instance }}, drive {{ $labels.device }})
                 query: smartctl_device_media_errors > 0
                 severity: critical
-                for: 15m
-              - name: Smart NVME Wearout Indicator
-                description: NVMe device is wearing out (instance {{ $labels.instance }})
-                query: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}
+              - name: SMART Wearout Indicator
+                description: Device is wearing out (instance {{ $labels.instance }}, drive {{ $labels.device }})
+                # The threshold is not present on devices that do not support it
+                query: smartctl_device_available_spare < smartctl_device_available_spare_threshold
                 severity: critical
-                for: 15m
 
       - name: Docker containers
         exporters: