From 3db9281508670f80ec71af53f09dd48496bd79f3 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 16 Mar 2026 02:34:34 +0100 Subject: [PATCH] feat: add SNMP exporter alerting rules (#507) Add 7 alerting rules for prometheus/snmp_exporter covering device availability, interface status, error rates, bandwidth utilization, and device restarts. Rules use standard IF-MIB and SNMPv2-MIB metrics. --- README.md | 1 + _data/rules.yml | 50 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/README.md b/README.md index 30997bf..0282e29 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch) - [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault) - [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare) +- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp) #### Other diff --git a/_data/rules.yml b/_data/rules.yml index 1ac85f7..dbc9dd8 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -3084,6 +3084,56 @@ groups: query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5' severity: critical + - name: SNMP + exporters: + - name: prometheus/snmp_exporter + slug: snmp-exporter + doc_url: https://github.com/prometheus/snmp_exporter + comments: | + These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration. + Thresholds for bandwidth and error rates are rough defaults - adjust to your environment. + rules: + - name: SNMP target down + description: "SNMP device {{ $labels.instance }} is unreachable." + query: 'up{job=~"snmp.*"} == 0' + severity: critical + for: 5m + comments: From the official snmp-mixin. 
+              - name: SNMP interface down
+                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
+                query: 'ifOperStatus == 2 and ifAdminStatus == 1'
+                severity: critical
+                for: 2m
+              - name: SNMP interface high inbound error rate
+                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%."
+                query: 'rate(ifInErrors[5m]) / (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0.05 and (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0'
+                severity: warning
+                for: 5m
+                comments: Threshold is a rough default. Adjust based on your network environment.
+              - name: SNMP interface high outbound error rate
+                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%."
+                query: 'rate(ifOutErrors[5m]) / (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0.05 and (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0'
+                severity: warning
+                for: 5m
+                comments: Threshold is a rough default. Adjust based on your network environment.
+              - name: SNMP interface high bandwidth usage inbound
+                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%."
+                query: 'rate(ifHCInOctets[5m]) * 8 / (ifHighSpeed * 1000000) > 0.80 and ifHighSpeed > 0'
+                severity: warning
+                for: 15m
+                comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns. Uses ifHighSpeed (Mbit/s) because ifSpeed saturates at about 4.29 Gbit/s.
+              - name: SNMP interface high bandwidth usage outbound
+                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%."
+                query: 'rate(ifHCOutOctets[5m]) * 8 / (ifHighSpeed * 1000000) > 0.80 and ifHighSpeed > 0'
+                severity: warning
+                for: 15m
+                comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns. Uses ifHighSpeed (Mbit/s) because ifSpeed saturates at about 4.29 Gbit/s.
+              - name: SNMP device restarted
+                description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes)."
+                query: 'sysUpTime / 100 < 300'
+                severity: info
+                comments: sysUpTime is in centiseconds (hundredths of a second). It resets when the SNMP agent is re-initialized and wraps after about 497 days, so either event also triggers this alert.
+
   - name: Other
     services:
       - name: Thanos