diff --git a/_data/rules.yml b/_data/rules.yml index 425dfee..4483ddf 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1136,12 +1136,12 @@ groups: 1m delay allows a restart without triggering an alert. - name: Memcached connection limit approaching (> 80%) description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)" - query: "(memcached_current_connections / memcached_max_connections * 100) > 80" + query: "(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0" severity: warning for: 2m - name: Memcached connection limit approaching (> 95%) description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)" - query: "(memcached_current_connections / memcached_max_connections * 100) > 95" + query: "(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0" severity: critical for: 2m - name: Memcached out of memory errors @@ -1151,7 +1151,7 @@ groups: for: 5m - name: Memcached memory usage high (> 90%) description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)" - query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90" + query: "(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0" severity: warning for: 5m comments: | @@ -1165,7 +1165,7 @@ groups: A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload. 
- name: Memcached low cache hit rate (< 80%) description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)" - query: "(rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) / (rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) + rate(memcached_commands_total{command=\"get\", status=\"miss\"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command=\"get\", status=\"hit\"}[5m]) + rate(memcached_commands_total{command=\"get\", status=\"miss\"}[5m])) > 0" + query: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0' severity: warning for: 10m comments: | @@ -3792,30 +3792,30 @@ groups: comments: From the official snmp-mixin. - name: SNMP interface down description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up." - query: "ifOperStatus{ifAdminStatus=\"1\"} == 2" + query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)' severity: critical for: 2m - name: SNMP interface high inbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an inbound error rate above 5%." 
- query: "rate(ifInErrors[5m]) / (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0.05 and (rate(ifHCInUcastPkts[5m]) + rate(ifHCInBroadcastPkts[5m]) + rate(ifHCInMulticastPkts[5m])) > 0" + query: 'rate(ifInErrors{job=~"snmp.*"}[5m]) / (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCInUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCInMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high outbound error rate description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} has an outbound error rate above 5%." - query: "rate(ifOutErrors[5m]) / (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0.05 and (rate(ifHCOutUcastPkts[5m]) + rate(ifHCOutBroadcastPkts[5m]) + rate(ifHCOutMulticastPkts[5m])) > 0" + query: 'rate(ifOutErrors{job=~"snmp.*"}[5m]) / (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0.05 and (rate(ifHCOutUcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutBroadcastPkts{job=~"snmp.*"}[5m]) + rate(ifHCOutMulticastPkts{job=~"snmp.*"}[5m])) > 0' severity: warning for: 5m comments: Threshold is a rough default. Adjust based on your network environment. - name: SNMP interface high bandwidth usage inbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} inbound utilization is above 80%." - query: "rate(ifHCInOctets[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0" + query: 'rate(ifHCInOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed{job=~"snmp.*"} > 0.80 and ifSpeed{job=~"snmp.*"} > 0' severity: warning for: 15m comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns. 
- name: SNMP interface high bandwidth usage outbound description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} outbound utilization is above 80%." - query: "rate(ifHCOutOctets[5m]) * 8 / ifSpeed > 0.80 and ifSpeed > 0" + query: 'rate(ifHCOutOctets{job=~"snmp.*"}[5m]) * 8 / ifSpeed{job=~"snmp.*"} > 0.80 and ifSpeed{job=~"snmp.*"} > 0' severity: warning for: 15m comments: Threshold is a rough default. Adjust based on your link capacity and traffic patterns.