mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-25 02:46:59 +08:00
feat: add SNMP exporter alerting rules (#507)
Add 7 alerting rules for prometheus/snmp_exporter covering device availability, interface status, error rates, bandwidth utilization, and device restarts. Rules use standard IF-MIB and SNMPv2-MIB metrics.
This commit is contained in:
parent
b039066277
commit
3db9281508
2 changed files with 51 additions and 0 deletions
|
|
@ -110,6 +110,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
|
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
|
||||||
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
|
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
|
||||||
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
|
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
|
||||||
|
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
|
||||||
|
|
||||||
#### Other
|
#### Other
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3084,6 +3084,56 @@ groups:
|
||||||
query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
|
query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
||||||
|
# SNMP service entry: alerting rules for prometheus/snmp_exporter built on
# IF-MIB and SNMPv2-MIB metrics. Every rule except "SNMP target down" and
# "SNMP device restarted" is evaluated per interface.
- name: SNMP
  exporters:
    - name: prometheus/snmp_exporter
      slug: snmp-exporter
      doc_url: https://github.com/prometheus/snmp_exporter
      comments: |
        These rules use standard IF-MIB and SNMPv2-MIB metrics. Metric names depend on your snmp.yml module configuration.
        Thresholds for bandwidth and error rates are rough defaults - adjust to your environment.
      rules:
        # Fires when the scrape of an SNMP job fails for 5 minutes.
        - name: SNMP target down
          description: "SNMP device {{ $labels.instance }} is unreachable."
          query: 'up{job=~"snmp.*"} == 0'
          severity: critical
          for: 5m
          comments: From the official snmp-mixin.

        # ifOperStatus == 2 means "down"; ifAdminStatus == 1 means "up".
        # NOTE(review): with the stock if_mib module ifAdminStatus is exported
        # as its own metric, not as a label on ifOperStatus, so the two series
        # are joined on the interface index instead of using a label matcher.
        - name: SNMP interface down
          description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
          query: 'ifOperStatus == 2 and on (job, instance, ifIndex) ifAdminStatus == 1'
          severity: critical
          for: 2m

        # Ratio of inbound errors to all inbound packets (unicast + broadcast +
        # multicast). The trailing "> 0" guard drops interfaces with no traffic
        # so a zero denominator never produces a result.
        - name: SNMP interface high inbound error rate
          description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%."
          query: 'rate(ifInErrors[5m]) / (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0.05 and (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0'
          severity: warning
          for: 5m
          comments: Threshold is a rough default. Adjust based on your network environment.

        # Outbound counterpart of the rule above, with the same zero-traffic guard.
        - name: SNMP interface high outbound error rate
          description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%."
          query: 'rate(ifOutErrors[5m]) / (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0.05 and (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0'
          severity: warning
          for: 5m
          comments: Threshold is a rough default. Adjust based on your network environment.

        # Utilization = bits per second / link speed. ifHighSpeed (Mbit/s) is
        # used rather than ifSpeed because ifSpeed is a 32-bit gauge that tops
        # out at 4,294,967,295 bit/s and would overstate utilization on 10G+
        # links. The "ifHighSpeed > 0" guard skips interfaces reporting no speed.
        - name: SNMP interface high bandwidth usage inbound
          description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%."
          query: 'rate(ifHCInOctets[5m]) * 8 / (ifHighSpeed * 1000000) > 0.80 and ifHighSpeed > 0'
          severity: warning
          for: 15m
          comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns.

        # Outbound counterpart of the rule above; same ifHighSpeed reasoning.
        - name: SNMP interface high bandwidth usage outbound
          description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%."
          query: 'rate(ifHCOutOctets[5m]) * 8 / (ifHighSpeed * 1000000) > 0.80 and ifHighSpeed > 0'
          severity: warning
          for: 15m
          comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns.

        # sysUpTime / 100 converts centiseconds to seconds; 300 s = 5 minutes.
        # No "for" clause: the condition is only true briefly after a restart.
        - name: SNMP device restarted
          description: "SNMP device {{ $labels.instance }} has restarted (uptime < 5 minutes)."
          query: 'sysUpTime / 100 < 300'
          severity: info
          comments: sysUpTime is in centiseconds (hundredths of a second).
|
||||||
|
|
||||||
- name: Other
|
- name: Other
|
||||||
services:
|
services:
|
||||||
- name: Thanos
|
- name: Thanos
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue