From d44bfd4c4b6dd1bc82e9a0503bbfa7302dbe8462 Mon Sep 17 00:00:00 2001
From: samber
Date: Mon, 16 Mar 2026 02:26:04 +0000
Subject: [PATCH] Publish

---
Reviewer notes (everything between the "---" line and the first "diff --git"
line is free-form commentary and is not applied to the tree):

* dist/rules/envoy/embedded-exporter.yml
  - Both ratio alerts stop adding 1 to the denominator. Instead, a trailing
    `and increase(envoy_cluster_upstream_rq_completed[5m]) > 0` clause keeps
    the alert silent while no requests completed in the window, so the
    reported percentage is no longer skewed at low traffic.
  - EnvoyHighClusterUpstream5xxErrorRate: the numerator is selected with
    envoy_response_code_class="5" but the denominator has no such selector.
    With default one-to-one vector matching the two sides must carry
    identical label sets, so the division now uses
    `ignoring(envoy_response_code_class)`. The ignored label is dropped from
    the result, which lets the trailing `and` guard match as well.
    NOTE(review): presumes envoy_cluster_upstream_rq_completed is exported
    without that label - confirm against a live /stats/prometheus scrape.
  - The `index` blob ids below are the ones recorded by the original publish;
    the post-image of this file changes with the expression above.

* dist/rules/memcached/memcached-exporter.yml
  - New rule group MemcachedExporter with 9 alerts: availability, connection
    saturation (80% warning / 95% critical), out-of-memory errors, memory
    usage, eviction rate, cache hit rate, rejected connections and oversized
    items.
  - The hit-rate alert only evaluates while there is `get` traffic (hit + miss
    rate > 0), which avoids a 0/0 division on idle instances.

 dist/rules/envoy/embedded-exporter.yml      |  6 +-
 dist/rules/memcached/memcached-exporter.yml | 91 +++++++++++++++++++++
 2 files changed, 93 insertions(+), 4 deletions(-)
 create mode 100644 dist/rules/memcached/memcached-exporter.yml

diff --git a/dist/rules/envoy/embedded-exporter.yml b/dist/rules/envoy/embedded-exporter.yml
index bbe9aaa..f489b0c 100644
--- a/dist/rules/envoy/embedded-exporter.yml
+++ b/dist/rules/envoy/embedded-exporter.yml
@@ -77,9 +77,8 @@ groups:
         summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
         description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-    # The +1 in the denominator guards against division by zero.
     - alert: EnvoyHighClusterUpstreamRequestTimeoutRate
-      expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+      expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
       for: 5m
       labels:
         severity: warning
@@ -87,9 +86,8 @@ groups:
         summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
         description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-    # The +1 in the denominator guards against division by zero.
     - alert: EnvoyHighClusterUpstream5xxErrorRate
-      expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
+      expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / ignoring(envoy_response_code_class) increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
       for: 1m
       labels:
         severity: critical
diff --git a/dist/rules/memcached/memcached-exporter.yml b/dist/rules/memcached/memcached-exporter.yml
new file mode 100644
index 0000000..c1a68b3
--- /dev/null
+++ b/dist/rules/memcached/memcached-exporter.yml
@@ -0,0 +1,91 @@
+groups:
+
+- name: MemcachedExporter
+
+
+  rules:
+
+    # 1m delay allows a restart without triggering an alert.
+    - alert: MemcachedDown
+      expr: 'memcached_up == 0'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Memcached down (instance {{ $labels.instance }})
+        description: "Memcached instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionLimitApproaching(>80%)
+      expr: '(memcached_current_connections / memcached_max_connections * 100) > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})
+        description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionLimitApproaching(>95%)
+      expr: '(memcached_current_connections / memcached_max_connections * 100) > 95'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})
+        description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: MemcachedOutOfMemoryErrors
+      expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached out of memory errors (instance {{ $labels.instance }})
+        description: "Memcached is returning out-of-memory errors on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
+    - alert: MemcachedMemoryUsageHigh(>90%)
+      expr: '(memcached_current_bytes / memcached_limit_bytes * 100) > 90'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached memory usage high (> 90%) (instance {{ $labels.instance }})
+        description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # A sustained eviction rate indicates memory pressure. Consider increasing memcached memory limit or reducing cache usage. Threshold of 10 evictions/s is a rough default — adjust based on your workload.
+    - alert: MemcachedHighEvictionRate
+      expr: 'rate(memcached_items_evicted_total[5m]) > 10'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached high eviction rate (instance {{ $labels.instance }})
+        description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
+    - alert: MemcachedLowCacheHitRate(<80%)
+      expr: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})
+        description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: MemcachedConnectionsRejected
+      expr: 'increase(memcached_connections_rejected_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Memcached connections rejected (instance {{ $labels.instance }})
+        description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: MemcachedItemsTooLarge
+      expr: 'increase(memcached_item_too_large_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: info
+      annotations:
+        summary: Memcached items too large (instance {{ $labels.instance }})
+        description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"