From 20651aa10d54ba61700b8b22a1de114d408ffc2a Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Mon, 16 Mar 2026 03:43:51 +0100
Subject: [PATCH] feat: add OpenStack alerting rules (openstack-exporter) (#515)

* feat: add OpenStack alerting rules (openstack-exporter)

Add 20 alerting rules for openstack-exporter/openstack-exporter covering
Nova, Neutron, Cinder, Octavia, and Placement services.

* docs: add OpenStack to README services list

* fix: align OpenStack load balancer alert name with operating_status semantics

The operating_status label uses ONLINE/OFFLINE/DEGRADED/ERROR values,
not ACTIVE. Rename alert to "not online" and use the label in the
description for clarity.
---
 README.md       |   1 +
 _data/rules.yml | 115 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/README.md b/README.md
index d4f1954..8cf7581 100644
--- a/README.md
+++ b/README.md
@@ -103,6 +103,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
 - [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
 - [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
+- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
 
 #### Network, security and storage
 
diff --git a/_data/rules.yml b/_data/rules.yml
index ff3c3dc..2005fdc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -3219,6 +3219,121 @@ groups:
                 severity: warning
                 for: 15m
 
+      - name: OpenStack
+        exporters:
+          - name: openstack-exporter/openstack-exporter
+            slug: openstack-exporter
+            doc_url: https://github.com/openstack-exporter/openstack-exporter
+            rules:
+              - name: OpenStack exporter down
+                description: The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.
+                query: 'up{job=~".*openstack.*"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack Nova agent down
+                description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
+                query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack Neutron agent down
+                description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down"
+                query: 'openstack_neutron_agent_state{adminState="up"} == 0'
+                severity: critical
+                for: 2m
+                comments: |
+                  Neutron agents expose adminState as "up"/"down", unlike Nova and Cinder agents which use "enabled"/"disabled".
+              - name: OpenStack Cinder agent down
+                description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
+                query: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
+                severity: critical
+                for: 2m
+              - name: OpenStack hypervisor high vCPU usage
+                description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%"
+                query: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+              - name: OpenStack hypervisor high memory usage
+                description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%"
+                query: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
+              - name: OpenStack hypervisor high disk usage
+                description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%"
+                query: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Nova tenant vCPU quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota"
+                query: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
+                severity: warning
+                comments: |
+                  A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
+              - name: OpenStack Nova tenant memory quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota"
+                query: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
+                severity: warning
+              - name: OpenStack Nova tenant instance quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota"
+                query: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
+                severity: warning
+              - name: OpenStack Cinder tenant volume quota nearly exhausted
+                description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota"
+                query: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
+                severity: warning
+              - name: OpenStack Cinder pool low free capacity
+                description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity"
+                query: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron floating IPs associated but not active
+                description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state"
+                query: 'openstack_neutron_floating_ips_associated_not_active > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron routers not active
+                description: "{{ $value }} Neutron routers are not in ACTIVE state"
+                query: 'openstack_neutron_routers_not_active > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Neutron subnet IP pool exhaustion
+                description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool"
+                query: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
+                severity: warning
+              - name: OpenStack Neutron ports without IPs
+                description: "{{ $value }} active ports have no IP addresses assigned"
+                query: 'openstack_neutron_ports_no_ips > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack load balancer not online
+                description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}"
+                query: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack Nova instances in ERROR state
+                description: "{{ $value }} Nova instances are in ERROR state"
+                query: 'count(openstack_nova_server_status{status="ERROR"}) > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  The sample value of openstack_nova_server_status is a numeric status code rather than 1, so count() is used to report the number of instances.
+              - name: OpenStack Cinder volumes in error state
+                description: "{{ $value }} Cinder volumes are in an error state"
+                query: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
+                severity: warning
+                for: 5m
+              - name: OpenStack placement resource high usage
+                description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation"
+                query: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
+                severity: warning
+                for: 5m
+                comments: |
+                  This alert factors in the allocation ratio to compute effective capacity.
+                  The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
+
   - name: Network, security and storage
     services:
       - name: Ceph