Merge remote-tracking branch 'samber/master'

# Conflicts:
#	_data/rules.yml
#	dist/rules/host-and-hardware/node-exporter.yml
This commit is contained in:
Evi Vanoost 2024-07-02 13:32:46 -04:00
commit 51d0484bb4
12 changed files with 436 additions and 113 deletions

View file

@ -231,7 +231,7 @@ GEM
jekyll-seo-tag (~> 2.1) jekyll-seo-tag (~> 2.1)
minitest (5.17.0) minitest (5.17.0)
multipart-post (2.1.1) multipart-post (2.1.1)
nokogiri (1.16.2-x86_64-linux) nokogiri (1.16.5-x86_64-linux)
racc (~> 1.4) racc (~> 1.4)
octokit (4.22.0) octokit (4.22.0)
faraday (>= 0.9) faraday (>= 0.9)
@ -243,7 +243,8 @@ GEM
rb-fsevent (0.11.1) rb-fsevent (0.11.1)
rb-inotify (0.10.1) rb-inotify (0.10.1)
ffi (~> 1.0) ffi (~> 1.0)
rexml (3.2.5) rexml (3.2.8)
strscan (>= 3.0.9)
rouge (3.26.0) rouge (3.26.0)
ruby2_keywords (0.0.5) ruby2_keywords (0.0.5)
rubyzip (2.3.2) rubyzip (2.3.2)
@ -258,6 +259,7 @@ GEM
faraday (> 0.8, < 2.0) faraday (> 0.8, < 2.0)
simpleidn (0.2.1) simpleidn (0.2.1)
unf (~> 0.1.4) unf (~> 0.1.4)
strscan (3.1.0)
terminal-table (1.8.0) terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1) unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6) thread_safe (0.3.6)

View file

@ -51,7 +51,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb) - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka) - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar) - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)

View file

@ -197,6 +197,7 @@ groups:
This is usually due to permissions issues or virtual filesystems. This is usually due to permissions issues or virtual filesystems.
Please add ignored mountpoints in node_exporter parameters like Please add ignored mountpoints in node_exporter parameters like
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
for: 2m
- name: Host inodes will fill in 24 hours - name: Host inodes will fill in 24 hours
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
@ -236,12 +237,16 @@ groups:
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities." description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities."
query: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' query: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
severity: warning severity: warning
- name: Host context switching for: 5m
description: Context switching is growing on the node (> 10000 / CPU / s) - name: Host context switching high
query: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000' description: Context switching is growing on the node (twice the daily average during the last 15m)
query: |
(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
/
(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
severity: warning severity: warning
comments: | comments: |
10000 context switches is an arbitrary number. x2 context switches is an arbitrary number.
The alert threshold depends on the nature of the application. The alert threshold depends on the nature of the application.
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- name: Host swap is filling up - name: Host swap is filling up
@ -833,7 +838,7 @@ groups:
The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- name: Redis out of configured maxmemory - name: Redis out of configured maxmemory
description: Redis is running out of configured maxmemory (> 90%) description: Redis is running out of configured maxmemory (> 90%)
query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90" query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
severity: warning severity: warning
for: 2m for: 2m
- name: Redis too many connections - name: Redis too many connections
@ -888,11 +893,6 @@ groups:
query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80' query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
severity: warning severity: warning
for: 2m for: 2m
- name: MongoDB virtual memory usage
description: High memory usage
query: "(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3"
severity: warning
for: 2m
- name: dcu/mongodb_exporter - name: dcu/mongodb_exporter
slug: dcu-mongodb-exporter slug: dcu-mongodb-exporter
@ -1137,9 +1137,44 @@ groups:
severity: warning severity: warning
for: 15m for: 15m
- name: Elasticsearch no new documents - name: Elasticsearch no new documents
description: No new documents for 10 min! description: "No new documents for 10 min!"
query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1' query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
severity: warning severity: warning
- name: Elasticsearch High Indexing Latency
description: "The indexing latency on Elasticsearch cluster is higher than the threshold."
query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005"
severity: warning
for: 10m
- name: Elasticsearch High Indexing Rate
description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
severity: warning
for: 5m
- name: Elasticsearch High Query Rate
description: "The query rate on Elasticsearch cluster is higher than the threshold."
query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
severity: warning
for: 5m
- name: Elasticsearch High Query Latency
description: "The query latency on Elasticsearch cluster is higher than the threshold."
query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1"
severity: warning
for: 5m
# Meilisearch alert rules, built on the metrics Meilisearch exposes itself
# (see doc_url for how the embedded /metrics endpoint is enabled).
- name: Meilisearch
  exporters:
    - name: Embedded exporter
      slug: embedded-exporter
      doc_url: https://github.com/orgs/meilisearch/discussions/625
      rules:
        # Fires when an index reports a document count of zero. The
        # description states exactly that condition, so it matches the
        # rule name and the query.
        - name: Meilisearch index is empty
          description: Meilisearch index has no documents
          query: 'meilisearch_index_docs_count == 0'
          severity: warning
        # NOTE(review): if meilisearch_http_response_time_seconds is exported
        # as a histogram, only the _bucket/_sum/_count series exist and this
        # comparison would never match - confirm against a live /metrics dump.
        - name: Meilisearch http response time
          description: Meilisearch http response time is too high
          query: "meilisearch_http_response_time_seconds > 0.5"
          severity: warning
- name: Cassandra - name: Cassandra
exporters: exporters:
@ -1292,6 +1327,88 @@ groups:
severity: critical severity: critical
for: 2m for: 2m
# ClickHouse alert rules, built on the metrics ClickHouse exposes itself.
# Three metric families are used below:
#   ClickHouseAsyncMetrics_* / ClickHouseMetrics_* - compared directly as
#     current values.
#   ClickHouseErrorMetric_* - treated as cumulative error counts, so every
#     rule on this family alerts on increase() over a window rather than on
#     an absolute value.
- name: Clickhouse
  exporters:
    - name: Embedded Exporter
      slug: embedded-exporter
      doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
      rules:
        - name: ClickHouse Memory Usage Critical
          description: Memory usage is critically high, over 90%.
          query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90"
          severity: critical
          for: 5m
        - name: ClickHouse Memory Usage Warning
          description: Memory usage is over 80%.
          query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80"
          severity: warning
          for: 5m
        # Free-space percentage is available / (available + used).
        - name: ClickHouse Disk Space Low on Default
          description: Disk space on default is below 20%.
          query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20"
          severity: warning
          for: 2m
        - name: ClickHouse Disk Space Critical on Default
          description: Disk space on default disk is critically low, below 10%.
          query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10"
          severity: critical
          for: 2m
        - name: ClickHouse Disk Space Low on Backups
          description: Disk space on backups is below 20%.
          query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20"
          severity: warning
          for: 2m
        # Error metrics accumulate, so "== 1" would only hold while exactly
        # one error had ever been recorded. Alert on new occurrences, like
        # the authentication / access-denied rules further down.
        - name: ClickHouse Replica Errors
          description: Critical replica errors detected, either all replicas are stale or lost.
          query: "increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0"
          severity: critical
          for: 0m
        - name: ClickHouse No Available Replicas
          description: No available replicas in ClickHouse.
          query: "increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0"
          severity: critical
          for: 0m
        - name: ClickHouse No Live Replicas
          description: There are too few live replicas available, risking data loss and service disruption.
          query: "increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0"
          severity: critical
          for: 0m
        - name: ClickHouse High Network Traffic
          description: Network traffic is unusually high, may affect cluster performance.
          query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250"
          severity: warning
          for: 5m
          comments: |
            Please replace the threshold with an appropriate value
        - name: ClickHouse High TCP Connections
          description: High number of TCP connections, indicating heavy client or inter-cluster communication.
          query: "ClickHouseMetrics_TCPConnection > 400"
          severity: warning
          for: 5m
          comments: |
            Please replace the threshold with an appropriate value
        # ClickHouseMetrics_* values go up and down, so the change over the
        # window is measured with delta(); increase() is meant for counters
        # and would read every drop as a counter reset.
        - name: ClickHouse Interserver Connection Issues
          description: An increase in interserver connections may indicate replication or distributed query handling issues.
          query: "delta(ClickHouseMetrics_InterserverConnection[5m]) > 0"
          severity: warning
          for: 1m
        # Aggregate per instance so the alert keeps its instance label and
        # one healthy server cannot offset an unhealthy one in the average.
        - name: ClickHouse ZooKeeper Connection Issues
          description: ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.
          query: "avg by (instance) (ClickHouseMetrics_ZooKeeperSession) != 1"
          severity: warning
          for: 3m
        - name: ClickHouse Authentication Failures
          description: Authentication failures detected, indicating potential security issues or misconfiguration.
          query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0"
          severity: info
          for: 0m
        - name: ClickHouse Access Denied Errors
          description: Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.
          query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0"
          severity: info
          for: 0m
- name: Zookeeper - name: Zookeeper
exporters: exporters:
- name: cloudflare/kafka_zookeeper_exporter - name: cloudflare/kafka_zookeeper_exporter
@ -1695,7 +1812,7 @@ groups:
severity: critical severity: critical
- name: HAProxy backend max active session - name: HAProxy backend max active session
description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%). description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
query: "((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80" query: "((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
severity: warning severity: warning
for: 2m for: 2m
- name: HAProxy pending requests - name: HAProxy pending requests
@ -1891,7 +2008,7 @@ groups:
for: 1m for: 1m
- name: Kubernetes HPA scale inability - name: Kubernetes HPA scale inability
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
severity: warning severity: warning
for: 2m for: 2m
- name: Kubernetes HPA metrics unavailability - name: Kubernetes HPA metrics unavailability
@ -1900,7 +2017,7 @@ groups:
severity: warning severity: warning
- name: Kubernetes HPA scale maximum - name: Kubernetes HPA scale maximum
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
query: "kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas" query: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
severity: info severity: info
for: 2m for: 2m
- name: Kubernetes HPA underutilized - name: Kubernetes HPA underutilized
@ -1981,7 +2098,7 @@ groups:
for: 12h for: 12h
- name: Kubernetes API server errors - name: Kubernetes API server errors
description: Kubernetes API server is experiencing high error rate description: Kubernetes API server is experiencing high error rate
query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
severity: critical severity: critical
for: 2m for: 2m
- name: Kubernetes API client errors - name: Kubernetes API client errors
@ -1999,7 +2116,7 @@ groups:
severity: critical severity: critical
- name: Kubernetes API server latency - name: Kubernetes API server latency
description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}." description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1' query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
severity: warning severity: warning
for: 2m for: 2m
@ -2336,7 +2453,7 @@ groups:
rules: rules:
- name: Minio cluster disk offline - name: Minio cluster disk offline
description: "Minio cluster disk is offline" description: "Minio cluster disk is offline"
query: "minio_cluster_disk_offline_total > 0" query: "minio_cluster_drive_offline_total > 0"
severity: critical severity: critical
- name: Minio node disk offline - name: Minio node disk offline
description: "Minio cluster node disk is offline" description: "Minio cluster node disk is offline"

View file

@ -0,0 +1,131 @@
# Prometheus alerting rules for ClickHouse (embedded exporter).
# ClickHouseErrorMetric_* series are treated as cumulative error counts, so
# every rule on that family alerts on increase() over a window;
# ClickHouseAsyncMetrics_* / ClickHouseMetrics_* are compared as current values.
groups:
  - name: EmbeddedExporter
    rules:
      - alert: ClickhouseMemoryUsageCritical
        expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
          description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseMemoryUsageWarning
        expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
          description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Free-space percentage is available / (available + used).
      - alert: ClickhouseDiskSpaceLowOnDefault
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
          description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseDiskSpaceCriticalOnDefault
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
          description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseDiskSpaceLowOnBackups
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
          description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Error metrics accumulate, so "== 1" would only hold while exactly one
      # error had ever been recorded. Alert on new occurrences instead, like
      # the authentication / access-denied rules at the end of this group.
      - alert: ClickhouseReplicaErrors
        expr: 'increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
          description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseNoAvailableReplicas
        expr: 'increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
          description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseNoLiveReplicas
        expr: 'increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
          description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # The 250 / 400 thresholds below are placeholders; tune them per cluster.
      - alert: ClickhouseHighNetworkTraffic
        expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
          description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseHighTcpConnections
        expr: 'ClickHouseMetrics_TCPConnection > 400'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
          description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # ClickHouseMetrics_* values go up and down, so the change over the
      # window is measured with delta(); increase() is meant for counters and
      # would read every drop as a counter reset.
      - alert: ClickhouseInterserverConnectionIssues
        expr: 'delta(ClickHouseMetrics_InterserverConnection[5m]) > 0'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
          description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Aggregate per instance: a bare avg() strips every label, leaving
      # {{ $labels.instance }} in the summary empty.
      - alert: ClickhouseZookeeperConnectionIssues
        expr: 'avg by (instance) (ClickHouseMetrics_ZooKeeperSession) != 1'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
          description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseAuthenticationFailures
        expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
          description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseAccessDeniedErrors
        expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
          description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -138,3 +138,39 @@ groups:
annotations: annotations:
summary: Elasticsearch no new documents (instance {{ $labels.instance }}) summary: Elasticsearch no new documents (instance {{ $labels.instance }})
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingLatency
expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
for: 10m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingRate
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryRate
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryLatency
expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -77,7 +77,7 @@ groups:
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession - alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80' expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning

View file

@ -5,7 +5,7 @@ groups:
rules: rules:
- alert: HostOutOfMemory - alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)' expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -14,88 +14,97 @@ groups:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure - alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)' expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }}) summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized - alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 1w
labels: labels:
severity: info severity: info
annotations: annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }}) summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn - alert: HostUnusualNetworkThroughputIn
expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }}) summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut - alert: HostUnusualNetworkThroughputOut
expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)' expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }}) summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate - alert: HostUnusualDiskReadRate
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)' expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }}) summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace - alert: HostUnusualDiskWriteRate
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes - alert: HostOutOfDiskSpace
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)' expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: critical severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations: annotations:
summary: Host out of inodes (instance {{ $labels.instance }}) summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError - alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1' expr: 'node_filesystem_device_error == 1'
for: 0m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Host filesystem device error (instance {{ $labels.instance }}) summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours - alert: HostInodesWillFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0' expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -104,7 +113,7 @@ groups:
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency - alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)' expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -113,7 +122,7 @@ groups:
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency - alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)' expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -122,7 +131,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad - alert: HostHighCpuLoad
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80' expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -131,16 +140,16 @@ groups:
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized - alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w for: 1w
labels: labels:
severity: info severity: info
annotations: annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }}) summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor - alert: HostCpuStealNoisyNeighbor
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10' expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -149,34 +158,37 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait - alert: HostCpuHighIowait
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10' expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }}) summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo - alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8' expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }}) summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching - alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000' expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
/
(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host context switching (instance {{ $labels.instance }}) summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp - alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)' expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -185,7 +197,7 @@ groups:
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed - alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)' expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -194,7 +206,7 @@ groups:
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius' expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -203,7 +215,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1' expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -211,44 +223,44 @@ groups:
summary: Host node overtemperature alarm (instance {{ $labels.instance }}) summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidInsufficientDrives - alert: HostRaidArrayGotInactive
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)' expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Host Software RAID insufficient drives (instance {{ $labels.instance }}) summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure - alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)' expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host Software RAID disk failure (instance {{ $labels.instance }}) summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations - alert: HostKernelVersionDeviations
expr: 'changes(node_uname_info[1h]) > 0' expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 6h
labels: labels:
severity: info severity: warning
annotations: annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }}) summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected - alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0)' expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: critical severity: warning
annotations: annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }}) summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected - alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)' expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: info severity: info
@ -257,7 +269,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected - alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)' expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -266,7 +278,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors - alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)' expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -275,7 +287,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors - alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)' expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -283,8 +295,17 @@ groups:
summary: Host Network Transmit Errors (instance {{ $labels.instance }}) summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when an interface's combined receive + transmit byte rate (1m window) divided by
# its node_network_speed_bytes value is above 0.8.
# Devices whose names start with tap, vnet, veth or tun are excluded by the device matcher.
# The trailing `< 10000` discards ratios of 10000 or more - presumably to ignore interfaces
# that report an implausibly small speed; TODO confirm.
# The final join on node_uname_info attaches the `nodename` label to the resulting series.
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
# The condition must hold for one minute before the alert fires.
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded - alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0)' expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -293,7 +314,7 @@ groups:
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit - alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)' expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -302,7 +323,7 @@ groups:
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew - alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))' expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -311,7 +332,7 @@ groups:
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising - alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)' expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -320,7 +341,7 @@ groups:
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot - alert: HostRequiresReboot
expr: '(node_reboot_required > 0)' expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h for: 4h
labels: labels:
severity: info severity: info

View file

@ -122,7 +122,7 @@ groups:
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleInability - alert: KubernetesHpaScaleInability
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -140,7 +140,7 @@ groups:
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleMaximum - alert: KubernetesHpaScaleMaximum
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas' expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
for: 2m for: 2m
labels: labels:
severity: info severity: info
@ -266,7 +266,7 @@ groups:
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors - alert: KubernetesApiServerErrors
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
@ -302,7 +302,7 @@ groups:
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency - alert: KubernetesApiServerLatency
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1' expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning

View file

@ -0,0 +1,23 @@
# Alerting rules for Meilisearch, based on the metrics it exposes itself
# (rule group "EmbeddedExporter").
groups:
  - name: EmbeddedExporter
    rules:
      # Fires immediately (for: 0m) when an index reports a document count of zero.
      - alert: MeilisearchIndexIsEmpty
        expr: 'meilisearch_index_docs_count == 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Meilisearch index is empty (instance {{ $labels.instance }})
          # The description now matches the condition: the expression checks the
          # document count of an index, not whether the instance is reachable.
          description: "Meilisearch index has no documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Fires immediately (for: 0m) when the response-time metric exceeds 0.5 seconds.
      # NOTE(review): if Meilisearch exports this metric as a Prometheus histogram,
      # only the _bucket/_sum/_count series exist and this bare selector would never
      # match - confirm against a live /metrics endpoint.
      - alert: MeilisearchHttpResponseTime
        expr: 'meilisearch_http_response_time_seconds > 0.5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Meilisearch http response time (instance {{ $labels.instance }})
          description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,7 +5,7 @@ groups:
rules: rules:
- alert: MinioClusterDiskOffline - alert: MinioClusterDiskOffline
expr: 'minio_cluster_disk_offline_total > 0' expr: 'minio_cluster_drive_offline_total > 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical

View file

@ -66,12 +66,3 @@ groups:
annotations: annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }}) summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when, per instance, the summed mongodb_ss_mem_virtual value is more than three
# times the summed mongodb_ss_mem_resident value.
- alert: MongodbVirtualMemoryUsage
expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
# The ratio must stay above the threshold for two minutes before the alert fires.
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -77,7 +77,7 @@ groups:
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RedisOutOfConfiguredMaxmemory - alert: RedisOutOfConfiguredMaxmemory
expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90' expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning