mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-25 02:46:59 +08:00
Merge branch 'master' into worktree-pr-511-fixes
This commit is contained in:
commit
0defa99bba
6 changed files with 567 additions and 4 deletions
|
|
@ -51,6 +51,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
|
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
|
||||||
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
|
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
|
||||||
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
|
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
|
||||||
|
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
|
||||||
|
|
||||||
#### Databases and brokers
|
#### Databases and brokers
|
||||||
|
|
||||||
|
|
@ -102,6 +103,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
|
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
|
||||||
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
|
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
|
||||||
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
|
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
|
||||||
|
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
|
||||||
|
|
||||||
#### Network, security and storage
|
#### Network, security and storage
|
||||||
|
|
||||||
|
|
|
||||||
179
_data/rules.yml
179
_data/rules.yml
|
|
@ -742,6 +742,74 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
||||||
|
# Alerting rules for ncabatoff/process-exporter (namedprocess_namegroup_* metrics).
# Every rule is evaluated per process group (label `groupname`).
- name: Process Exporter
  exporters:
    - name: ncabatoff/process-exporter
      slug: process-exporter
      doc_url: https://github.com/ncabatoff/process-exporter
      rules:
        - name: Process exporter group down
          description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_num_procs == 0'
          severity: critical
          for: 2m
        - name: Process exporter high memory usage
          description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
          severity: warning
          for: 5m
          comments: |
            Threshold of 4GB is arbitrary and depends on the process being monitored. Adjust per group.
        - name: Process exporter high CPU usage
          description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})"
          # The counter is exported once per `mode` (user/system); sum them so the
          # threshold applies to the group's total CPU time, not to each mode alone.
          query: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
          severity: warning
          for: 5m
          comments: |
            Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc. User and system time are summed. Threshold of 80% means 80% of one core. Adjust based on expected workload.
        - name: Process exporter high file descriptor usage
          description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
          severity: warning
          for: 5m
        - name: Process exporter file descriptors exhausted
          description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
          severity: critical
          for: 2m
        - name: Process exporter high swap usage
          description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
          severity: warning
          for: 5m
          comments: |
            Threshold of 512MB is arbitrary. Adjust per group and environment.
        - name: Process exporter zombie processes
          description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})"
          query: 'namedprocess_namegroup_states{state="Zombie"} > 0'
          severity: warning
          for: 5m
        - name: Process exporter high context switching
          description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})"
          # The counter is exported once per `ctxswitchtype` (voluntary/nonvoluntary);
          # sum them so the threshold applies to the group's total switch rate.
          query: 'sum without (ctxswitchtype) (rate(namedprocess_namegroup_context_switches_total[5m])) > 10000'
          severity: warning
          for: 5m
          comments: |
            Voluntary and nonvoluntary switches are summed. Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
        - name: Process exporter high disk write IO
          description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})"
          query: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
          severity: warning
          for: 5m
          comments: |
            Threshold of 100MB/s is arbitrary. Adjust per group.
        - name: Process exporter process restarting
          description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})"
          query: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
          severity: info
          comments: |
            Detects restarts by watching for changes in the oldest process start time within the group.
|
||||||
|
|
||||||
- name: Databases and brokers
|
- name: Databases and brokers
|
||||||
services:
|
services:
|
||||||
- name: MySQL
|
- name: MySQL
|
||||||
|
|
@ -3151,6 +3219,117 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 15m
|
for: 15m
|
||||||
|
|
||||||
|
# Alerting rules for openstack-exporter/openstack-exporter (openstack_* metrics).
# Ratio rules carry an extra "denominator > 0" guard so that an unset or
# unlimited (-1) limit never produces an alert.
- name: OpenStack
  exporters:
    - name: openstack-exporter/openstack-exporter
      slug: openstack-exporter
      doc_url: https://github.com/openstack-exporter/openstack-exporter
      rules:
        - name: OpenStack exporter down
          description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.
          query: 'up{job=~".*openstack.*"} == 0'
          severity: critical
          for: 2m
        - name: OpenStack Nova agent down
          description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
          query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
          severity: critical
          for: 2m
        - name: OpenStack Neutron agent down
          description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down"
          query: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
          severity: critical
          for: 2m
        - name: OpenStack Cinder agent down
          description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
          query: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
          severity: critical
          for: 2m
        - name: OpenStack hypervisor high vCPU usage
          description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%"
          query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
          severity: warning
          for: 5m
          comments: |
            The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
        - name: OpenStack hypervisor high memory usage
          description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%"
          query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
          severity: warning
          for: 5m
          comments: |
            The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
        - name: OpenStack hypervisor high disk usage
          description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%"
          query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
          severity: warning
          for: 5m
        - name: OpenStack Nova tenant vCPU quota nearly exhausted
          description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota"
          query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
          severity: warning
          comments: |
            A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
        - name: OpenStack Nova tenant memory quota nearly exhausted
          description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota"
          query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
          severity: warning
        - name: OpenStack Nova tenant instance quota nearly exhausted
          description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota"
          query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
          severity: warning
        - name: OpenStack Cinder tenant volume quota nearly exhausted
          description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota"
          query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
          severity: warning
        - name: OpenStack Cinder pool low free capacity
          description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity"
          query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
          severity: warning
          for: 5m
        - name: OpenStack Neutron floating IPs associated but not active
          description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state"
          query: 'openstack_neutron_floating_ips_associated_not_active > 0'
          severity: warning
          for: 5m
        - name: OpenStack Neutron routers not active
          description: "{{ $value }} Neutron routers are not in ACTIVE state"
          query: 'openstack_neutron_routers_not_active > 0'
          severity: warning
          for: 5m
        - name: OpenStack Neutron subnet IP pool exhaustion
          description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool"
          query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
          severity: warning
        - name: OpenStack Neutron ports without IPs
          description: "{{ $value }} active ports have no IP addresses assigned"
          query: 'openstack_neutron_ports_no_ips > 0'
          severity: warning
          for: 5m
        - name: OpenStack load balancer not online
          description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}"
          query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
          severity: warning
          for: 5m
        - name: OpenStack Nova instances in ERROR state
          description: "{{ $value }} Nova instances are in ERROR state"
          # count(), not sum(): the sample value is a numeric status code, so summing
          # it would not yield the number of instances reported in the description.
          query: 'count(openstack_nova_server_status{status="ERROR"}) > 0'
          severity: warning
          for: 5m
          comments: |
            The metric value encodes the server status as a number, so series are counted rather than summed.
        - name: OpenStack Cinder volumes in error state
          description: "{{ $value }} Cinder volumes are in an error state"
          query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
          severity: warning
          for: 5m
        - name: OpenStack placement resource high usage
          description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation"
          query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
          severity: warning
          for: 5m
          comments: |
            This alert factors in the allocation ratio to compute effective capacity.
            The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
|
||||||
|
|
||||||
- name: Network, security and storage
|
- name: Network, security and storage
|
||||||
services:
|
services:
|
||||||
- name: Ceph
|
- name: Ceph
|
||||||
|
|
|
||||||
6
dist/rules/envoy/embedded-exporter.yml
vendored
6
dist/rules/envoy/embedded-exporter.yml
vendored
|
|
@ -77,9 +77,8 @@ groups:
|
||||||
summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
|
summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
|
||||||
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
# The +1 in the denominator guards against division by zero.
|
|
||||||
- alert: EnvoyHighClusterUpstreamRequestTimeoutRate
|
- alert: EnvoyHighClusterUpstreamRequestTimeoutRate
|
||||||
expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
|
expr: 'increase(envoy_cluster_upstream_rq_timeout[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -87,9 +86,8 @@ groups:
|
||||||
summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
|
summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
|
||||||
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
# The +1 in the denominator guards against division by zero.
|
|
||||||
- alert: EnvoyHighClusterUpstream5xxErrorRate
|
- alert: EnvoyHighClusterUpstream5xxErrorRate
|
||||||
expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / (increase(envoy_cluster_upstream_rq_completed[5m]) + 1) * 100 > 5'
|
expr: 'increase(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / increase(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and increase(envoy_cluster_upstream_rq_completed[5m]) > 0'
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
91
dist/rules/memcached/memcached-exporter.yml
vendored
Normal file
91
dist/rules/memcached/memcached-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,91 @@
|
||||||
|
# Prometheus alerting rules for the memcached exporter.
# Expressions are written as folded scalars and descriptions as literal blocks;
# the parsed string values are unchanged.
groups:

- name: MemcachedExporter

  rules:

    # Waiting 1m before firing lets a restart complete without raising an alert.
    - alert: MemcachedDown
      expr: >-
        memcached_up == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Memcached down (instance {{ $labels.instance }})'
        description: |-
          Memcached instance is down on {{ $labels.instance }}
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Warning tier: more than 80% of the configured connection limit is in use.
    - alert: 'MemcachedConnectionLimitApproaching(>80%)'
      expr: >-
        (memcached_current_connections / memcached_max_connections * 100) > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})'
        description: |-
          Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Critical tier: more than 95% of the configured connection limit is in use.
    - alert: 'MemcachedConnectionLimitApproaching(>95%)'
      expr: >-
        (memcached_current_connections / memcached_max_connections * 100) > 95
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: 'Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})'
        description: |-
          Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Out-of-memory errors are summed over all slabs before comparing with zero.
    - alert: MemcachedOutOfMemoryErrors
      expr: >-
        sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached out of memory errors (instance {{ $labels.instance }})'
        description: |-
          Memcached is returning out-of-memory errors on {{ $labels.instance }}
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # A well-used cache is expected to be nearly full. This fires only when usage
    # gets close to the configured limit, at which point evictions may start.
    - alert: 'MemcachedMemoryUsageHigh(>90%)'
      expr: >-
        (memcached_current_bytes / memcached_limit_bytes * 100) > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached memory usage high (> 90%) (instance {{ $labels.instance }})'
        description: |-
          Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Sustained evictions point to memory pressure: raise the memcached memory
    # limit or reduce cache usage. 10 evictions/s is only a rough default; tune it
    # to your workload.
    - alert: MemcachedHighEvictionRate
      expr: >-
        rate(memcached_items_evicted_total[5m]) > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached high eviction rate (instance {{ $labels.instance }})'
        description: |-
          Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # A poor hit rate can mean the cache is badly utilized, keys are wrong, or TTLs
    # are too short. 80% is only a rough default; tune it to your workload and
    # access patterns. The trailing "> 0" clause skips instances with no get traffic.
    - alert: 'MemcachedLowCacheHitRate(<80%)'
      expr: >-
        (rate(memcached_commands_total{command="get", status="hit"}[5m])
        / (rate(memcached_commands_total{command="get", status="hit"}[5m])
        + rate(memcached_commands_total{command="get", status="miss"}[5m]))
        * 100) < 80
        and (rate(memcached_commands_total{command="get", status="hit"}[5m])
        + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})'
        description: |-
          Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Any growth of the rejected-connections counter over 5m triggers this rule.
    - alert: MemcachedConnectionsRejected
      expr: >-
        increase(memcached_connections_rejected_total[5m]) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Memcached connections rejected (instance {{ $labels.instance }})'
        description: |-
          Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}

    # Informational: items larger than max-item-size were refused in the last 5m.
    - alert: MemcachedItemsTooLarge
      expr: >-
        increase(memcached_item_too_large_total[5m]) > 0
      for: 5m
      labels:
        severity: info
      annotations:
        summary: 'Memcached items too large (instance {{ $labels.instance }})'
        description: |-
          Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)
           VALUE = {{ $value }}
           LABELS = {{ $labels }}
|
||||||
191
dist/rules/openstack/openstack-exporter.yml
vendored
Normal file
191
dist/rules/openstack/openstack-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,191 @@
|
||||||
|
# Prometheus alerting rules for openstack-exporter/openstack-exporter.
# Ratio rules carry an extra "denominator > 0" guard so that an unset or
# unlimited (-1) limit never produces an alert.
groups:

- name: OpenstackExporter

  rules:

    - alert: OpenstackExporterDown
      expr: 'up{job=~".*openstack.*"} == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenStack exporter down (instance {{ $labels.instance }})
        description: "The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNovaAgentDown
      expr: 'openstack_nova_agent_state{adminState="enabled"} == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenStack Nova agent down (instance {{ $labels.instance }})
        description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNeutronAgentDown
      expr: 'openstack_neutron_agent_state{adminState="enabled"} == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenStack Neutron agent down (instance {{ $labels.instance }})
        description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackCinderAgentDown
      expr: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenStack Cinder agent down (instance {{ $labels.instance }})
        description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
    - alert: OpenstackHypervisorHighVcpuUsage
      expr: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }})
        description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
    - alert: OpenstackHypervisorHighMemoryUsage
      expr: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack hypervisor high memory usage (instance {{ $labels.instance }})
        description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackHypervisorHighDiskUsage
      expr: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack hypervisor high disk usage (instance {{ $labels.instance }})
        description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
    - alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted
      expr: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }})
        description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted
      expr: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }})
        description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted
      expr: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }})
        description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted
      expr: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }})
        description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackCinderPoolLowFreeCapacity
      expr: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Cinder pool low free capacity (instance {{ $labels.instance }})
        description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNeutronFloatingIpsAssociatedButNotActive
      expr: 'openstack_neutron_floating_ips_associated_not_active > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }})
        description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNeutronRoutersNotActive
      expr: 'openstack_neutron_routers_not_active > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Neutron routers not active (instance {{ $labels.instance }})
        description: "{{ $value }} Neutron routers are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNeutronSubnetIpPoolExhaustion
      expr: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }})
        description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackNeutronPortsWithoutIps
      expr: 'openstack_neutron_ports_no_ips > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Neutron ports without IPs (instance {{ $labels.instance }})
        description: "{{ $value }} active ports have no IP addresses assigned\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackLoadBalancerNotOnline
      expr: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack load balancer not online (instance {{ $labels.instance }})
        description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # The metric value encodes the server status as a number, so series are counted rather than summed.
    - alert: OpenstackNovaInstancesInErrorState
      expr: 'count(openstack_nova_server_status{status="ERROR"}) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Nova instances in ERROR state (instance {{ $labels.instance }})
        description: "{{ $value }} Nova instances are in ERROR state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpenstackCinderVolumesInErrorState
      expr: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack Cinder volumes in error state (instance {{ $labels.instance }})
        description: "{{ $value }} Cinder volumes are in an error state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # This alert factors in the allocation ratio to compute effective capacity.
    # The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
    - alert: OpenstackPlacementResourceHighUsage
      expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenStack placement resource high usage (instance {{ $labels.instance }})
        description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
102
dist/rules/process-exporter/process-exporter.yml
vendored
Normal file
102
dist/rules/process-exporter/process-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
# Alerting rules for process-exporter (namedprocess_namegroup_* metrics).
# Every rule is evaluated per process group (label "groupname") and per scraped instance.
groups:
  - name: ProcessExporter
    rules:
      # Fires when a process group has reported zero processes for 2 minutes.
      - alert: ProcessExporterGroupDown
        expr: 'namedprocess_namegroup_num_procs == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Process exporter group down (instance {{ $labels.instance }})
          description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 4GB (4e+09 bytes of resident memory) is arbitrary and depends on the
      # process being monitored. Adjust per group.
      - alert: ProcessExporterHighMemoryUsage
        expr: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high memory usage (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc.
      # The CPU counter is exported once per CPU mode (user/system), so the per-mode rates are
      # summed before comparing; otherwise each mode would be checked against the threshold on
      # its own and the reported value would not be the group's total CPU usage.
      # Threshold of 80% means 80% of one core. Adjust based on expected workload.
      - alert: ProcessExporterHighCpuUsage
        expr: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high CPU usage (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Warning level: the worst file descriptor ratio in the group is above 0.8 (80% of the limit).
      - alert: ProcessExporterHighFileDescriptorUsage
        expr: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high file descriptor usage (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Critical level of the rule above: ratio above 0.95 for 2 minutes.
      # Both rules match above 0.95, so the warning fires alongside this one.
      - alert: ProcessExporterFileDescriptorsExhausted
        expr: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: Process exporter file descriptors exhausted (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 512MB (512e+06 bytes swapped) is arbitrary. Adjust per group and environment.
      - alert: ProcessExporterHighSwapUsage
        expr: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high swap usage (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the group has had at least one process in state "Zombie" for 5 minutes.
      - alert: ProcessExporterZombieProcesses
        expr: 'namedprocess_namegroup_states{state="Zombie"} > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter zombie processes (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 10000 switches/s is a rough default. Adjust based on the workload profile.
      # NOTE(review): this counter appears to be split by a ctxswitchtype label
      # (voluntary/nonvoluntary), in which case the threshold applies to each type separately -
      # confirm whether a per-type or a summed rate is intended.
      - alert: ProcessExporterHighContextSwitching
        expr: 'rate(namedprocess_namegroup_context_switches_total[5m]) > 10000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high context switching (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 100MB/s (100e+06 bytes/s averaged over 5 minutes) is arbitrary. Adjust per group.
      - alert: ProcessExporterHighDiskWriteIo
        expr: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Process exporter high disk write IO (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Detects restarts by watching for changes in the oldest process start time within the group.
      # The "and ... num_procs > 0" clause limits the alert to groups that currently have processes.
      # "for: 0m" makes it fire on the first evaluation that sees a change.
      - alert: ProcessExporterProcessRestarting
        expr: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: Process exporter process restarting (instance {{ $labels.instance }})
          description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
Loading…
Reference in a new issue