From 8b443be6d2f2e3e8685569b6c5d28f7d35fdcffb Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 16 Mar 2026 14:07:14 +0100
Subject: [PATCH] feat: add systemd_exporter alerting rules (7 rules) (#522)

* feat: add systemd_exporter alerting rules (7 rules)

Add new Systemd service under Basic resource monitoring with rules for:
- Unit failed/inactive state detection
- Service crash loop detection
- Task limit exhaustion
- Socket refused/high connections
- Timer missed trigger

* fix: narrow systemd unit inactive query to reduce noise

Add type="service" and name filter to the inactive unit alert
to avoid false positives from legitimately inactive units.
---
 README.md       |  1 +
 _data/rules.yml | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/README.md b/README.md
index 2babb28..64781ca 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
 - [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
 - [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
+- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)
 
 #### Databases and brokers
 
diff --git a/_data/rules.yml b/_data/rules.yml
index 229392e..d097b18 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -810,6 +810,52 @@ groups:
                 comments: |
                   Detects restarts by watching for changes in the oldest process start time within the group.
 
+      - name: Systemd
+        exporters:
+          - name: prometheus-community/systemd_exporter
+            slug: systemd-exporter
+            doc_url: https://github.com/prometheus-community/systemd_exporter
+            rules:
+              - name: Systemd unit failed
+                description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})"
+                query: 'systemd_unit_state{state="failed"} == 1'
+                severity: warning
+                for: 5m
+              - name: Systemd unit inactive
+                description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})"
+                query: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
+                severity: warning
+                for: 5m
+                comments: |
+                  Many units are legitimately inactive. You must adjust the name=~ filter to match your critical services.
+              - name: Systemd service crash looping
+                description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})"
+                query: 'increase(systemd_service_restart_total[1h]) > 5'
+                severity: critical
+                for: 5m
+              - name: Systemd unit tasks near limit
+                description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})"
+                query: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
+                severity: warning
+                for: 5m
+              - name: Systemd socket refused connections
+                description: "Systemd socket {{ $labels.name }} is refusing connections. (instance {{ $labels.instance }})"
+                query: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
+                severity: warning
+              - name: Systemd socket high connections
+                description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})"
+                query: 'systemd_socket_current_connections > 100'
+                severity: warning
+                comments: |
+                  Threshold of 100 connections is arbitrary. Adjust to your workload.
+              - name: Systemd timer missed trigger
+                description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})"
+                query: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule.
+
   - name: Databases and brokers
     services:
       - name: MySQL