From 6b1685261d3c17477a3fff1cbbb85fecbc9a6d3c Mon Sep 17 00:00:00 2001
From: Pavel Timofeev <timp87@gmail.com>
Date: Sat, 19 Aug 2023 16:39:22 -0600
Subject: [PATCH 001/123] Rework kube-state-metrics alerts (#381)

* Rework kube-state-metrics alerts:
- provide meaningful labels in the summary, as the 'instance' label hardly makes sense in most of them
- rename some alerts to describe the problem more accurately
- adjust descriptions, trying to follow the message schema found in other alerts

* move changes to _data/rules.yml

* Update rules.yml

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml                              |  64 +++++-----
 dist/rules/kubernetes/kubestate-exporter.yml | 121 ++++++++++---------
 2 files changed, 93 insertions(+), 92 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index a6f46d4..7fb1a6a 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1693,32 +1693,32 @@ groups:
             slug: kubestate-exporter
             doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs
             rules:
-              - name: Kubernetes node not ready
+              - name: Kubernetes Node not ready
                 description: Node {{ $labels.node }} has been unready for a long time
                 query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
                 severity: critical
                 for: 10m
-              - name: Kubernetes memory pressure
-                description: "{{ $labels.node }} has MemoryPressure condition"
+              - name: Kubernetes Node memory pressure
+                description: "Node {{ $labels.node }} has MemoryPressure condition"
                 query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
                 severity: critical
                 for: 2m
-              - name: Kubernetes disk pressure
-                description: "{{ $labels.node }} has DiskPressure condition"
+              - name: Kubernetes Node disk pressure
+                description: "Node {{ $labels.node }} has DiskPressure condition"
                 query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
                 severity: critical
                 for: 2m
-              - name: Kubernetes network unavailable
-                description: "{{ $labels.node }} has NetworkUnavailable condition"
+              - name: Kubernetes Node network unavailable
+                description: "Node {{ $labels.node }} has NetworkUnavailable condition"
                 query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
                 severity: critical
                 for: 2m
-              - name: Kubernetes out of capacity
-                description: "{{ $labels.node }} is out of capacity"
+              - name: Kubernetes Node out of pod capacity
+                description: "Node {{ $labels.node }} is out of pod capacity"
                 query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
                 severity: warning
                 for: 2m
-              - name: Kubernetes container oom killer
+              - name: Kubernetes Container oom killed
                 description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
                 query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
                 severity: warning
@@ -1741,83 +1741,83 @@ groups:
                 severity: warning
                 for: 2m
               - name: Kubernetes Volume full in four days
-                description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
+                description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
                 query: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
                 severity: critical
               - name: Kubernetes PersistentVolume error
-                description: "Persistent volume is in bad state"
+                description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
                 query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
                 severity: critical
               - name: Kubernetes StatefulSet down
-                description: A StatefulSet went down
+                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
                 query: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
                 severity: critical
                 for: 1m
-              - name: Kubernetes HPA scaling ability
-                description: Pod is unable to scale
+              - name: Kubernetes HPA scale inability
+                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
                 query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
                 severity: warning
                 for: 2m
-              - name: Kubernetes HPA metric availability
-                description: HPA is not able to collect metrics
+              - name: Kubernetes HPA metrics unavailability
+                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics
                 query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
                 severity: warning
-              - name: Kubernetes HPA scale capability
-                description: The maximum number of desired Pods has been hit
+              - name: Kubernetes HPA scale maximum
+                description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
                 query: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
                 severity: info
                 for: 2m
               - name: Kubernetes HPA underutilized
-                description: HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.
+                description: HPA {{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
                 query: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'  # allow minimum 3 replicas running
                 severity: info
               - name: Kubernetes Pod not healthy
-                description: Pod has been in a non-ready state for longer than 15 minutes.
+                description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.
                 query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
                 severity: critical
                 for: 15m
               - name: Kubernetes pod crash looping
-                description: Pod {{ $labels.pod }} is crash looping
+                description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
                 query: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
                 severity: warning
                 for: 2m
-              - name: Kubernetes ReplicasSet mismatch
-                description: Deployment Replicas mismatch
+              - name: Kubernetes ReplicaSet replicas mismatch
+                description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
                 query: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment replicas mismatch
-                description: Deployment Replicas mismatch
+                description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
                 query: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
                 severity: warning
                 for: 10m
               - name: Kubernetes StatefulSet replicas mismatch
-                description: A StatefulSet does not match the expected number of replicas.
+                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replicas mismatch
                 query: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment generation mismatch
-                description: A Deployment has failed but has not been rolled back.
+                description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
                 query: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet generation mismatch
-                description: A StatefulSet has failed but has not been rolled back.
+                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
                 query: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet update not rolled out
-                description: StatefulSet update has not been rolled out.
+                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
                 query: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet rollout stuck
-                description: Some Pods of DaemonSet are not scheduled or not ready
+                description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
                 query: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet misscheduled
-                description: Some DaemonSet Pods are running where they are not supposed to run
+                description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
                 query: 'kube_daemonset_status_number_misscheduled > 0'
                 severity: critical
                 for: 1m
@@ -1827,7 +1827,7 @@ groups:
                 severity: warning
                 comments: |
                   Threshold should be customized for each cronjob name.
-              - name: Kubernetes job slow completion
+              - name: Kubernetes Job slow completion
                 description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
                 query: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
                 severity: critical
diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 783682e..719948e 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -10,52 +10,52 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes node not ready (instance {{ $labels.instance }})
+        summary: Kubernetes Node not ready (node {{ $labels.node }})
         description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesMemoryPressure
+    - alert: KubernetesNodeMemoryPressure
       expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
       for: 2m
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes memory pressure (instance {{ $labels.instance }})
-        description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Node memory pressure (node {{ $labels.node }})
+        description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesDiskPressure
+    - alert: KubernetesNodeDiskPressure
       expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
       for: 2m
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes disk pressure (instance {{ $labels.instance }})
-        description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Node disk pressure (node {{ $labels.node }})
+        description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesNetworkUnavailable
+    - alert: KubernetesNodeNetworkUnavailable
       expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
       for: 2m
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes network unavailable (instance {{ $labels.instance }})
-        description: "{{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Node network unavailable (node {{ $labels.node }})
+        description: "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesOutOfCapacity
+    - alert: KubernetesNodeOutOfPodCapacity
       expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes out of capacity (instance {{ $labels.instance }})
-        description: "{{ $labels.node }} is out of capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Node out of pod capacity (node {{ $labels.node }})
+        description: "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesContainerOomKiller
+    - alert: KubernetesContainerOomKilled
       expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes container oom killer (instance {{ $labels.instance }})
+        summary: Kubernetes Container oom killed (pod {{ $labels.pod }})
         description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobFailed
@@ -64,7 +64,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Job failed (instance {{ $labels.instance }})
+        summary: Kubernetes Job failed (job_name {{ $labels.job_name }})
         description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobSuspended
@@ -73,7 +73,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob suspended (cronjob {{ $labels.cronjob }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPersistentvolumeclaimPending
@@ -82,7 +82,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
+        summary: Kubernetes PersistentVolumeClaim pending (pvc {{ $labels.persistentvolumeclaim }})
         description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeOutOfDiskSpace
@@ -91,8 +91,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
-        description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Volume out of disk space (pvc {{ $labels.persistentvolumeclaim }})
+        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeFullInFourDays
       expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
@@ -100,8 +100,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
-        description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Volume full in four days (pvc {{ $labels.persistentvolumeclaim }})
+        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPersistentvolumeError
       expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
@@ -109,8 +109,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
-        description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes PersistentVolume error (pv {{ $labels.persistentvolume }})
+        description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetDown
       expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
@@ -118,35 +118,36 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
-        description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes StatefulSet down (statefulset {{ $labels.statefulset }})
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesHpaScalingAbility
+    - alert: KubernetesHpaScaleInability
       expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
-        description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes HPA scale inability (hpa {{ $labels.horizontalpodautoscaler }})
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesHpaMetricAvailability
+    - alert: KubernetesHpaMetricsUnavailability
       expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
-        description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes HPA metrics unavailability (hpa {{ $labels.horizontalpodautoscaler }})
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesHpaScaleCapability
+
+    - alert: KubernetesHpaScaleMaximum
       expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
       for: 2m
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
-        description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes HPA scale maximum (hpa {{ $labels.horizontalpodautoscaler }})
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaUnderutilized
       expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
@@ -154,8 +155,8 @@ groups:
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
-        description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes HPA underutilized (hpa {{ $labels.horizontalpodautoscaler }})
+        description: "HPA {{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodNotHealthy
       expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
@@ -163,8 +164,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
-        description: "Pod has been in a non-ready state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Pod not healthy (pod {{ $labels.pod }})
+        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodCrashLooping
       expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
@@ -172,17 +173,17 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
-        description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes pod crash looping (pod {{ $labels.pod }})
+        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesReplicassetMismatch
+    - alert: KubernetesReplicasetReplicasMismatch
       expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
       for: 10m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
-        description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes ReplicaSet replicas mismatch (replicaset {{ $labels.replicaset }})
+        description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDeploymentReplicasMismatch
       expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
@@ -190,8 +191,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
-        description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Deployment replicas mismatch (deployment {{ $labels.deployment }})
+        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetReplicasMismatch
       expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
@@ -199,8 +200,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
-        description: "A StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes StatefulSet replicas mismatch (statefulset {{ $labels.statefulset }})
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDeploymentGenerationMismatch
       expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@@ -208,8 +209,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
-        description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Deployment generation mismatch (deployment {{ $labels.deployment }})
+        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetGenerationMismatch
       expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
@@ -217,8 +218,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
-        description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes StatefulSet generation mismatch (statefulset {{ $labels.statefulset }})
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetUpdateNotRolledOut
       expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
@@ -226,8 +227,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
-        description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes StatefulSet update not rolled out (statefulset {{ $labels.statefulset }})
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetRolloutStuck
       expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
@@ -235,8 +236,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
-        description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes DaemonSet rollout stuck (daemonset {{ $labels.daemonset }})
+        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetMisscheduled
       expr: 'kube_daemonset_status_number_misscheduled > 0'
@@ -244,8 +245,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
-        description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes DaemonSet misscheduled (daemonset {{ $labels.daemonset }})
+        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobTooLong
       expr: 'time() - kube_cronjob_next_schedule_time > 3600'
@@ -253,7 +254,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob too long (cronjob {{ $labels.cronjob }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobSlowCompletion
@@ -262,7 +263,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes job slow completion (instance {{ $labels.instance }})
+        summary: Kubernetes Job slow completion (job_name {{ $labels.job_name }})
         description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerErrors

From 4279dedb52c8bb5faf3af92fff36ec175bc86980 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sat, 19 Aug 2023 22:41:12 +0000
Subject: [PATCH 002/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 65 ++++++++++----------
 1 file changed, 32 insertions(+), 33 deletions(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 719948e..f53ac4e 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -10,7 +10,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node not ready (node {{ $labels.node }})
+        summary: Kubernetes Node not ready (instance {{ $labels.instance }})
         description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeMemoryPressure
@@ -19,7 +19,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node memory pressure (node {{ $labels.node }})
+        summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
         description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeDiskPressure
@@ -28,7 +28,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node disk pressure (node {{ $labels.node }})
+        summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
         description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeNetworkUnavailable
@@ -37,7 +37,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node network unavailable (node {{ $labels.node }})
+        summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
         description: "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeOutOfPodCapacity
@@ -46,16 +46,16 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Node out of pod capacity (node {{ $labels.node }})
+        summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
         description: "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: KubernetesContainerOomKilled
+    - alert: KubernetesContainerOomKiller
       expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Container oom killed (pod {{ $labels.pod }})
+        summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
         description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobFailed
@@ -64,7 +64,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Job failed (job_name {{ $labels.job_name }})
+        summary: Kubernetes Job failed (instance {{ $labels.instance }})
         description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobSuspended
@@ -73,7 +73,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob suspended (cronjob {{ $labels.cronjob }})
+        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPersistentvolumeclaimPending
@@ -82,7 +82,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes PersistentVolumeClaim pending (pvc {{ $labels.persistentvolumeclaim }})
+        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
         description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeOutOfDiskSpace
@@ -91,8 +91,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Volume out of disk space (pvc {{ $labels.persistentvolumeclaim }})
-        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }})
+        description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeFullInFourDays
       expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
@@ -100,7 +100,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Volume full in four days (pvc {{ $labels.persistentvolumeclaim }})
+        summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
         description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPersistentvolumeError
@@ -109,7 +109,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes PersistentVolume error (pv {{ $labels.persistentvolume }})
+        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
         description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetDown
@@ -118,7 +118,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet down (statefulset {{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaScaleInability
@@ -127,7 +127,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA scale inability (hpa {{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaMetricsUnavailability
@@ -136,17 +136,16 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA metrics unavailability (hpa {{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-
     - alert: KubernetesHpaScaleMaximum
       expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
       for: 2m
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA scale maximum (hpa {{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaUnderutilized
@@ -155,7 +154,7 @@ groups:
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA underutilized (hpa {{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodNotHealthy
@@ -164,7 +163,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Pod not healthy (pod {{ $labels.pod }})
+        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
         description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodCrashLooping
@@ -173,7 +172,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes pod crash looping (pod {{ $labels.pod }})
+        summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
         description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesReplicasetReplicasMismatch
@@ -182,7 +181,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes ReplicaSet replicas mismatch (replicaset {{ $labels.replicaset }})
+        summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
         description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDeploymentReplicasMismatch
@@ -191,7 +190,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Deployment replicas mismatch (deployment {{ $labels.deployment }})
+        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
         description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetReplicasMismatch
@@ -200,8 +199,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes StatefulSet replicas mismatch (statefulset {{ $labels.statefulset }})
-        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
+        description: "StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDeploymentGenerationMismatch
       expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@@ -209,7 +208,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Deployment generation mismatch (deployment {{ $labels.deployment }})
+        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
         description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetGenerationMismatch
@@ -218,7 +217,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet generation mismatch (statefulset {{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetUpdateNotRolledOut
@@ -227,7 +226,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes StatefulSet update not rolled out (statefulset {{ $labels.statefulset }})
+        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetRolloutStuck
@@ -236,7 +235,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes DaemonSet rollout stuck (daemonset {{ $labels.daemonset }})
+        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
         description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetMisscheduled
@@ -245,7 +244,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes DaemonSet misscheduled (daemonset {{ $labels.daemonset }})
+        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
         description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobTooLong
@@ -254,7 +253,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob too long (cronjob {{ $labels.cronjob }})
+        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobSlowCompletion
@@ -263,7 +262,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Job slow completion (job_name {{ $labels.job_name }})
+        summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
         description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerErrors

From 053cde27e45758af0cd4dc8f770e3a7e4d21a18f Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 22 Aug 2023 15:51:53 +0200
Subject: [PATCH 003/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 7fb1a6a..d6d3c2a 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -619,7 +619,7 @@ groups:
                 severity: warning
               - name: Postgresql too many connections
                 description: PostgreSQL instance has too many connections (> 80%).
-                expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
+                query: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
                 severity: warning
                 for: 2m
               - name: Postgresql not enough connections

From 93a62d4271ca039fda968c1ccd291ea6ee1826d3 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 22 Aug 2023 13:53:16 +0000
Subject: [PATCH 004/123] Publish

---
 dist/rules/postgresql/postgres-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 2c4a793..e9dc0b5 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -50,7 +50,7 @@ groups:
         description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTooManyConnections
-      expr: ''
+      expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
       for: 2m
       labels:
         severity: warning

From bacb4330899b466014303dfe2fc7f14a311588c3 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 18 Sep 2023 20:14:57 +0200
Subject: [PATCH 005/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index d6d3c2a..d818035 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1742,7 +1742,7 @@ groups:
                 for: 2m
               - name: Kubernetes Volume full in four days
                 description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
-                query: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+                query: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
                 severity: critical
               - name: Kubernetes PersistentVolume error
                 description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"

From ccdfd22a41fa4aaaf1c2d1c2a938555b9f532b07 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 18 Sep 2023 18:16:22 +0000
Subject: [PATCH 006/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index f53ac4e..8ba0458 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -95,7 +95,7 @@ groups:
         description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeFullInFourDays
-      expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+      expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
       for: 0m
       labels:
         severity: critical

From 95fab18a2f80c02873272543465be017bbf6f52a Mon Sep 17 00:00:00 2001
From: Nicholas Devenish <ndevenish@gmail.com>
Date: Tue, 26 Sep 2023 19:27:26 +0100
Subject: [PATCH 007/123] Fix typo "critial" in receiver routes (#389)

---
 alertmanager.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alertmanager.md b/alertmanager.md
index 7de03e9..c593845 100644
--- a/alertmanager.md
+++ b/alertmanager.md
@@ -80,7 +80,7 @@ route:
     - receiver: "pager"
       group_wait: 10s
       match_re:
-        severity: critial
+        severity: critical
       continue: true
 
 receivers:

From 672f26692c2c41551730a655968a1d405b6f1d30 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 2 Oct 2023 10:22:35 +0200
Subject: [PATCH 008/123] build(deps): bump actions/checkout from 3 to 4 (#390)

Bumps [actions/checkout](https://github.com/actions/checkout) from 3 to 4.
- [Release notes](https://github.com/actions/checkout/releases)
- [Changelog](https://github.com/actions/checkout/blob/main/CHANGELOG.md)
- [Commits](https://github.com/actions/checkout/compare/v3...v4)

---
updated-dependencies:
- dependency-name: actions/checkout
  dependency-type: direct:production
  update-type: version-update:semver-major
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 .github/workflows/dist.yml | 2 +-
 .github/workflows/test.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml
index 64a318a..1f64e36 100644
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout Repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5d0cc3e..c397867 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -8,7 +8,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout Repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1

From 32a097836a9d1861f08955255d5ecb97c1160385 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Fri, 6 Oct 2023 18:48:38 +0200
Subject: [PATCH 009/123] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 45b4bf4..a4d2053 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
 - [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
 - [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
+- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
 
 #### Reverse proxies and load balancers
 

From 7a8f883df6633cdbccd9c9a54c618e0aff496658 Mon Sep 17 00:00:00 2001
From: Vicky Wilson Jacob <vickywilsonj@gmail.com>
Date: Fri, 6 Oct 2023 12:48:54 -0400
Subject: [PATCH 010/123] feat: adding hadoop jmx exporter (#391)

* adding hadoop exporter

* added hadoop rules with jmx exporter

* added hadoop rules with jmx exporter

* Update rules.yml

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml                           |  77 ++++++++++++++++
 dist/rules/hadoop/hadoop-jmx-exporter.yml | 102 ++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 dist/rules/hadoop/hadoop-jmx-exporter.yml

diff --git a/_data/rules.yml b/_data/rules.yml
index d818035..12d8e34 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1405,6 +1405,83 @@ groups:
                 query: 'solr_collections_live_nodes < 2'
                 severity: critical
 
+      - name: Hadoop
+        exporters:
+          - name: hadoop/jmx_exporter
+            slug: jmx_exporter
+            doc_url: https://github.com/prometheus/jmx_exporter
+            rules:
+              # Alert rule for NameNode availability
+              - name: Hadoop Name Node Down
+                query: up{job="hadoop-namenode"} == 0
+                for: 5m
+                severity: critical
+                description: "The Hadoop NameNode service is unavailable."
+
+              # Alert rule for ResourceManager availability
+              - name: Hadoop Resource Manager Down
+                query: up{job="hadoop-resourcemanager"} == 0
+                for: 5m
+                severity: critical
+                description: "The Hadoop ResourceManager service is unavailable."
+
+              # Alert rule for DataNode status
+              - name: Hadoop Data Node Out Of Service
+                query: hadoop_datanode_last_heartbeat == 0
+                for: 10m
+                severity: warning
+                description: "The Hadoop DataNode is not sending heartbeats."
+
+              # Alert rule for low HDFS disk space
+              - name: Hadoop HDFS Disk Space Low
+                query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
+                for: 15m
+                severity: warning
+                description: "Available HDFS disk space is running low."
+
+              # Alert rule for excessive MapReduce task failures
+              - name: Hadoop Map Reduce Task Failures
+                query: hadoop_mapreduce_task_failures_total > 100
+                for: 10m
+                severity: critical
+                description: "There is an unusually high number of MapReduce task failures."
+
+              # Alert rule for high ResourceManager memory usage
+              - name: Hadoop Resource Manager Memory High
+                query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
+                for: 15m
+                severity: warning
+                description: "The Hadoop ResourceManager is approaching its memory limit."
+
+              # Alert rule for high YARN container allocation failures
+              - name: Hadoop YARN Container Allocation Failures
+                query: hadoop_yarn_container_allocation_failures_total > 10
+                for: 10m
+                severity: warning
+                description: "There is a significant number of YARN container allocation failures."
+
+              # Alert rule for excessive HBase region server region count
+              - name: Hadoop HBase Region Count High
+                query: hadoop_hbase_region_count > 5000
+                for: 15m
+                severity: warning
+                description: "The HBase cluster has an unusually high number of regions."
+
+              # Alert rule for low HBase region server heap space
+              - name: Hadoop HBase Region Server Heap Low
+                query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
+                for: 10m
+                severity: critical
+                description: "HBase Region Servers are running low on heap space."
+
+              # Alert rule for high HBase Write Requests latency
+              - name: Hadoop HBase Write Requests Latency High
+                query: hadoop_hbase_write_requests_latency_seconds > 0.5
+                for: 10m
+                severity: warning
+                description: "HBase Write Requests are experiencing high latency."
+
+
   - name: Reverse proxies and load balancers
     services:
       - name: Nginx
diff --git a/dist/rules/hadoop/hadoop-jmx-exporter.yml b/dist/rules/hadoop/hadoop-jmx-exporter.yml
new file mode 100644
index 0000000..42e3478
--- /dev/null
+++ b/dist/rules/hadoop/hadoop-jmx-exporter.yml
@@ -0,0 +1,102 @@
+groups:
+  - name: HadoopAlerts
+    rules:
+      # Alert rule for NameNode availability
+      - alert: HadoopNameNodeDown
+        expr: up{job="hadoop-namenode"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hadoop NameNode is down"
+          description: "The Hadoop NameNode service is unavailable."
+
+      # Alert rule for ResourceManager availability
+      - alert: HadoopResourceManagerDown
+        expr: up{job="hadoop-resourcemanager"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hadoop ResourceManager is down"
+          description: "The Hadoop ResourceManager service is unavailable."
+
+      # Alert rule for DataNode status
+      - alert: HadoopDataNodeOutOfService
+        expr: hadoop_datanode_last_heartbeat == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Hadoop DataNode is out of service"
+          description: "The Hadoop DataNode is not sending heartbeats."
+
+      # Alert rule for low HDFS disk space
+      - alert: HadoopHDFSDiskSpaceLow
+        expr: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low HDFS Disk Space"
+          description: "Available HDFS disk space is running low."
+
+      # Alert rule for excessive MapReduce task failures
+      - alert: HadoopMapReduceTaskFailures
+        expr: hadoop_mapreduce_task_failures_total > 100
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Excessive MapReduce Task Failures"
+          description: "There is an unusually high number of MapReduce task failures."
+
+      # Alert rule for high ResourceManager memory usage
+      - alert: HadoopResourceManagerMemoryHigh
+        expr: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High ResourceManager Memory Usage"
+          description: "The Hadoop ResourceManager is approaching its memory limit."
+
+      # Alert rule for high YARN container allocation failures
+      - alert: HadoopYARNContainerAllocationFailures
+        expr: hadoop_yarn_container_allocation_failures_total > 10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High YARN Container Allocation Failures"
+          description: "There is a significant number of YARN container allocation failures."
+
+      # Alert rule for excessive HBase region server region count
+      - alert: HadoopHBaseRegionCountHigh
+        expr: hadoop_hbase_region_count > 5000
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Excessive HBase Region Count"
+          description: "The HBase cluster has an unusually high number of regions."
+
+      # Alert rule for low HBase region server heap space
+      - alert: HadoopHBaseRegionServerHeapLow
+        expr: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Low HBase Region Server Heap Space"
+          description: "HBase Region Servers are running low on heap space."
+
+      # Alert rule for high HBase Write Requests latency
+      - alert: HadoopHBaseWriteRequestsLatencyHigh
+        expr: hadoop_hbase_write_requests_latency_seconds > 0.5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High HBase Write Requests Latency"
+          description: "HBase Write Requests are experiencing high latency."
\ No newline at end of file

From 82f27986204e2ea12ca9214bbea8837e0c387e05 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Fri, 6 Oct 2023 16:50:22 +0000
Subject: [PATCH 011/123] Publish

---
 dist/rules/hadoop/hadoop-jmx-exporter.yml | 102 ----------------------
 dist/rules/hadoop/jmx_exporter.yml        |  95 ++++++++++++++++++++
 2 files changed, 95 insertions(+), 102 deletions(-)
 delete mode 100644 dist/rules/hadoop/hadoop-jmx-exporter.yml
 create mode 100644 dist/rules/hadoop/jmx_exporter.yml

diff --git a/dist/rules/hadoop/hadoop-jmx-exporter.yml b/dist/rules/hadoop/hadoop-jmx-exporter.yml
deleted file mode 100644
index 42e3478..0000000
--- a/dist/rules/hadoop/hadoop-jmx-exporter.yml
+++ /dev/null
@@ -1,102 +0,0 @@
-groups:
-  - name: HadoopAlerts
-    rules:
-      # Alert rule for NameNode availability
-      - alert: HadoopNameNodeDown
-        expr: up{job="hadoop-namenode"} == 0
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Hadoop NameNode is down"
-          description: "The Hadoop NameNode service is unavailable."
-
-      # Alert rule for ResourceManager availability
-      - alert: HadoopResourceManagerDown
-        expr: up{job="hadoop-resourcemanager"} == 0
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Hadoop ResourceManager is down"
-          description: "The Hadoop ResourceManager service is unavailable."
-
-      # Alert rule for DataNode status
-      - alert: HadoopDataNodeOutOfService
-        expr: hadoop_datanode_last_heartbeat == 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Hadoop DataNode is out of service"
-          description: "The Hadoop DataNode is not sending heartbeats."
-
-      # Alert rule for low HDFS disk space
-      - alert: HadoopHDFSDiskSpaceLow
-        expr: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Low HDFS Disk Space"
-          description: "Available HDFS disk space is running low."
-
-      # Alert rule for excessive MapReduce task failures
-      - alert: HadoopMapReduceTaskFailures
-        expr: hadoop_mapreduce_task_failures_total > 100
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Excessive MapReduce Task Failures"
-          description: "There is an unusually high number of MapReduce task failures."
-
-      # Alert rule for high ResourceManager memory usage
-      - alert: HadoopResourceManagerMemoryHigh
-        expr: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High ResourceManager Memory Usage"
-          description: "The Hadoop ResourceManager is approaching its memory limit."
-
-      # Alert rule for high YARN container allocation failures
-      - alert: HadoopYARNContainerAllocationFailures
-        expr: hadoop_yarn_container_allocation_failures_total > 10
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High YARN Container Allocation Failures"
-          description: "There is a significant number of YARN container allocation failures."
-
-      # Alert rule for excessive HBase region server region count
-      - alert: HadoopHBaseRegionCountHigh
-        expr: hadoop_hbase_region_count > 5000
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Excessive HBase Region Count"
-          description: "The HBase cluster has an unusually high number of regions."
-
-      # Alert rule for low HBase region server heap space
-      - alert: HadoopHBaseRegionServerHeapLow
-        expr: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Low HBase Region Server Heap Space"
-          description: "HBase Region Servers are running low on heap space."
-
-      # Alert rule for high HBase Write Requests latency
-      - alert: HadoopHBaseWriteRequestsLatencyHigh
-        expr: hadoop_hbase_write_requests_latency_seconds > 0.5
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High HBase Write Requests Latency"
-          description: "HBase Write Requests are experiencing high latency."
\ No newline at end of file
diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml
new file mode 100644
index 0000000..42d6ee3
--- /dev/null
+++ b/dist/rules/hadoop/jmx_exporter.yml
@@ -0,0 +1,95 @@
+groups:
+
+- name: Jmx_exporter
+
+  rules:
+
+    - alert: HadoopNameNodeDown
+      expr: 'up{job="hadoop-namenode"} == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Name Node Down (instance {{ $labels.instance }})
+        description: "The Hadoop NameNode service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopResourceManagerDown
+      expr: 'up{job="hadoop-resourcemanager"} == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
+        description: "The Hadoop ResourceManager service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopDataNodeOutOfService
+      expr: 'hadoop_datanode_last_heartbeat == 0'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
+        description: "The Hadoop DataNode is not sending heartbeats.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopHdfsDiskSpaceLow
+      expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
+        description: "Available HDFS disk space is running low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopMapReduceTaskFailures
+      expr: 'hadoop_mapreduce_task_failures_total > 100'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
+        description: "There is an unusually high number of MapReduce task failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopResourceManagerMemoryHigh
+      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
+        description: "The Hadoop ResourceManager is approaching its memory limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopYarnContainerAllocationFailures
+      expr: 'hadoop_yarn_container_allocation_failures_total > 10'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
+        description: "There is a significant number of YARN container allocation failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopHbaseRegionCountHigh
+      expr: 'hadoop_hbase_region_count > 5000'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
+        description: "The HBase cluster has an unusually high number of regions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopHbaseRegionServerHeapLow
+      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
+        description: "HBase Region Servers are running low on heap space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HadoopHbaseWriteRequestsLatencyHigh
+      expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
+        description: "HBase Write Requests are experiencing high latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From cbf7046afae68172c01c07e62c540118a382fb83 Mon Sep 17 00:00:00 2001
From: Pierre Riteau <pierre@stackhpc.com>
Date: Fri, 13 Oct 2023 17:09:10 +0200
Subject: [PATCH 012/123] Fix capitalisation of RabbitMQ (#392)

---
 _data/rules.yml | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 12d8e34..c060c1d 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -906,45 +906,45 @@ groups:
             slug: rabbitmq-exporter
             doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
             rules:
-              - name: Rabbitmq node down
+              - name: RabbitMQ node down
                 description: Less than 3 nodes running in RabbitMQ cluster
                 query: 'sum(rabbitmq_build_info) < 3'
                 severity: critical
-              - name: Rabbitmq node not distributed
+              - name: RabbitMQ node not distributed
                 description: Distribution link state is not 'up'
                 query: 'erlang_vm_dist_node_state < 3'
                 severity: critical
-              - name: Rabbitmq instances different versions
-                description: Running different version of Rabbitmq in the same cluster, can lead to failure.
+              - name: RabbitMQ instances different versions
+                description: Running different version of RabbitMQ in the same cluster, can lead to failure.
                 query: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
                 severity: warning
                 for: 1h
-              - name: Rabbitmq memory high
+              - name: RabbitMQ memory high
                 description: A node use more than 90% of allocated RAM
                 query: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
                 severity: warning
                 for: 2m
-              - name: Rabbitmq file descriptors usage
+              - name: RabbitMQ file descriptors usage
                 description: A node use more than 90% of file descriptors
                 query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
                 severity: warning
                 for: 2m
-              - name: Rabbitmq too many unack messages
+              - name: RabbitMQ too many unack messages
                 description: Too many unacknowledged messages
                 query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
                 severity: warning
                 for: 1m
-              - name: Rabbitmq too many connections
+              - name: RabbitMQ too many connections
                 description: The total connections of a node is too high
                 query: 'rabbitmq_connections > 1000'
                 severity: warning
                 for: 2m
-              - name: Rabbitmq no queue consumer
+              - name: RabbitMQ no queue consumer
                 description: A queue has less than 1 consumer
                 query: 'rabbitmq_queue_consumers < 1'
                 severity: warning
                 for: 1m   # allows a short service restart
-              - name: Rabbitmq unroutable messages
+              - name: RabbitMQ unroutable messages
                 description: A queue has unroutable messages
                 query: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
                 severity: warning
@@ -954,61 +954,61 @@ groups:
             slug: kbudde-rabbitmq-exporter
             doc_url: https://github.com/kbudde/rabbitmq_exporter
             rules:
-              - name: Rabbitmq down
+              - name: RabbitMQ down
                 description: RabbitMQ node down
                 query: 'rabbitmq_up == 0'
                 severity: critical
-              - name: Rabbitmq cluster down
+              - name: RabbitMQ cluster down
                 description: Less than 3 nodes running in RabbitMQ cluster
                 query: 'sum(rabbitmq_running) < 3'
                 severity: critical
-              - name: Rabbitmq cluster partition
+              - name: RabbitMQ cluster partition
                 description: Cluster partition
                 query: 'rabbitmq_partitions > 0'
                 severity: critical
-              - name: Rabbitmq out of memory
+              - name: RabbitMQ out of memory
                 description: Memory available for RabbmitMQ is low (< 10%)
                 query: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
                 severity: warning
                 for: 2m
-              - name: Rabbitmq too many connections
+              - name: RabbitMQ too many connections
                 description: RabbitMQ instance has too many connections (> 1000)
                 query: 'rabbitmq_connectionsTotal > 1000'
                 severity: warning
                 for: 2m
-              - name: Rabbitmq dead letter queue filling up
+              - name: RabbitMQ dead letter queue filling up
                 description: Dead letter queue is filling up (> 10 msgs)
                 query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
                 severity: warning
                 for: 1m
                 comments: |
                   Indicate the queue name in dedicated label.
-              - name: Rabbitmq too many messages in queue
+              - name: RabbitMQ too many messages in queue
                 description: Queue is filling up (> 1000 msgs)
                 query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
                 severity: warning
                 for: 2m
                 comments: |
                   Indicate the queue name in dedicated label.
-              - name: Rabbitmq slow queue consuming
+              - name: RabbitMQ slow queue consuming
                 description: Queue messages are consumed slowly (> 60s)
                 query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
                 severity: warning
                 for: 2m
                 comments: |
                   Indicate the queue name in dedicated label.
-              - name: Rabbitmq no consumer
+              - name: RabbitMQ no consumer
                 description: Queue has no consumer
                 query: 'rabbitmq_queue_consumers == 0'
                 severity: critical
                 for: 1m    # allows a short service restart
-              - name: Rabbitmq too many consumers
+              - name: RabbitMQ too many consumers
                 description: Queue should have only 1 consumer
                 query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
                 severity: critical
                 comments: |
                   Indicate the queue name in dedicated label.
-              - name: Rabbitmq unactive exchange
+              - name: RabbitMQ unactive exchange
                 description: Exchange receive less than 5 msgs per second
                 query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
                 severity: warning

From 97da7f97b69c1b767475ce57eccb8e7ae3bb6e68 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Fri, 13 Oct 2023 15:10:33 +0000
Subject: [PATCH 013/123] Publish

---
 .../rabbitmq/kbudde-rabbitmq-exporter.yml     | 22 +++++++++----------
 dist/rules/rabbitmq/rabbitmq-exporter.yml     | 20 ++++++++---------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
index 05af1c7..40b6d95 100644
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@@ -10,7 +10,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq down (instance {{ $labels.instance }})
+        summary: RabbitMQ down (instance {{ $labels.instance }})
         description: "RabbitMQ node down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqClusterDown
@@ -19,7 +19,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq cluster down (instance {{ $labels.instance }})
+        summary: RabbitMQ cluster down (instance {{ $labels.instance }})
         description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqClusterPartition
@@ -28,7 +28,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq cluster partition (instance {{ $labels.instance }})
+        summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
         description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqOutOfMemory
@@ -37,7 +37,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq out of memory (instance {{ $labels.instance }})
+        summary: RabbitMQ out of memory (instance {{ $labels.instance }})
         description: "Memory available for RabbmitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyConnections
@@ -46,7 +46,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
         description: "RabbitMQ instance has too many connections (> 1000)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqDeadLetterQueueFillingUp
@@ -55,7 +55,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})
+        summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
         description: "Dead letter queue is filling up (> 10 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyMessagesInQueue
@@ -64,7 +64,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq too many messages in queue (instance {{ $labels.instance }})
+        summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
         description: "Queue is filling up (> 1000 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqSlowQueueConsuming
@@ -73,7 +73,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq slow queue consuming (instance {{ $labels.instance }})
+        summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
         description: "Queue messages are consumed slowly (> 60s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqNoConsumer
@@ -82,7 +82,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq no consumer (instance {{ $labels.instance }})
+        summary: RabbitMQ no consumer (instance {{ $labels.instance }})
         description: "Queue has no consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyConsumers
@@ -91,7 +91,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq too many consumers (instance {{ $labels.instance }})
+        summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
         description: "Queue should have only 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqUnactiveExchange
@@ -100,5 +100,5 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq unactive exchange (instance {{ $labels.instance }})
+        summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
         description: "Exchange receive less than 5 msgs per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml
index 6efae97..be95359 100644
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@@ -10,7 +10,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq node down (instance {{ $labels.instance }})
+        summary: RabbitMQ node down (instance {{ $labels.instance }})
         description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqNodeNotDistributed
@@ -19,7 +19,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Rabbitmq node not distributed (instance {{ $labels.instance }})
+        summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
         description: "Distribution link state is not 'up'\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqInstancesDifferentVersions
@@ -28,8 +28,8 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq instances different versions (instance {{ $labels.instance }})
-        description: "Running different version of Rabbitmq in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
+        description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqMemoryHigh
       expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
@@ -37,7 +37,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq memory high (instance {{ $labels.instance }})
+        summary: RabbitMQ memory high (instance {{ $labels.instance }})
         description: "A node use more than 90% of allocated RAM\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqFileDescriptorsUsage
@@ -46,7 +46,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }})
+        summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
         description: "A node use more than 90% of file descriptors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyUnackMessages
@@ -55,7 +55,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq too many unack messages (instance {{ $labels.instance }})
+        summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
         description: "Too many unacknowledged messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyConnections
@@ -64,7 +64,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
         description: "The total connections of a node is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqNoQueueConsumer
@@ -73,7 +73,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq no queue consumer (instance {{ $labels.instance }})
+        summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
         description: "A queue has less than 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqUnroutableMessages
@@ -82,5 +82,5 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Rabbitmq unroutable messages (instance {{ $labels.instance }})
+        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
         description: "A queue has unroutable messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 76de11d71b884710f4d4490d34784281c0e3e8b5 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 24 Oct 2023 15:03:51 +0200
Subject: [PATCH 014/123] Update rules.yml

---
 _data/rules.yml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index c060c1d..4df1771 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -690,6 +690,11 @@ groups:
                 for: 1h
                 comments: |
                   See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
+              - name: Postgresql invalid index
+                description: 'The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`'
+                query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+                severity: warning
+                for: 6h
 
       - name: SQL Server
         exporters:

From 308b3c52dd308ed9120971dea7ec0c1fa2e614df Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 24 Oct 2023 13:05:40 +0000
Subject: [PATCH 015/123] Publish

---
 dist/rules/postgresql/postgres-exporter.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index e9dc0b5..0e1f473 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -183,3 +183,12 @@ groups:
       annotations:
         summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
         description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: PostgresqlInvalidIndex
+      expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+      for: 6h
+      labels:
+        severity: warning
+      annotations:
+        summary: Postgresql invalid index (instance {{ $labels.instance }})
+        description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 2ffde4f24c43e6441c2416b915f47c82f451569d Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 21 Nov 2023 18:38:09 +0100
Subject: [PATCH 016/123] Update alertmanager.md

---
 alertmanager.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/alertmanager.md b/alertmanager.md
index c593845..6b07089 100644
--- a/alertmanager.md
+++ b/alertmanager.md
@@ -135,4 +135,7 @@ If the notification takes too much time to be triggered, check the following del
 - `for: 5m` (alerts/example-mysql.yml)
 - `group_wait = 10s` (alertmanager.yml)
 
-Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
+Also read:
+- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
+- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
+

From a4de5323ad9561e10dd65ba9a7f5df5808568f52 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 26 Nov 2023 02:18:16 +0100
Subject: [PATCH 017/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 4df1771..ed73ece 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -241,7 +241,7 @@ groups:
                 severity: warning
                 for: 5m
               - name: Host context switching
-                description: Context switching is growing on the node (> 10000 / s)
+                description: Context switching is growing on the node (> 10000 / CPU / s)
                 query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 comments: |

From 7d05d142d5b8be59b6a60d9fd80923d57eb0fd95 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 26 Nov 2023 01:19:24 +0000
Subject: [PATCH 018/123] Publish

---
 dist/rules/host-and-hardware/node-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 791d893..c63e0c8 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -182,7 +182,7 @@ groups:
         severity: warning
       annotations:
         summary: Host context switching (instance {{ $labels.instance }})
-        description: "Context switching is growing on the node (> 10000 / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Context switching is growing on the node (> 10000 / CPU / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostSwapIsFillingUp
       expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'

From 7fa11bf6cc2f1f5204d3fbab8a3a3939cd0a0dbd Mon Sep 17 00:00:00 2001
From: michaelact <86778470+michaelact@users.noreply.github.com>
Date: Sat, 2 Dec 2023 00:25:11 +0700
Subject: [PATCH 019/123] Add simple and meaningful `kube-state-metrics` alert
 summary (#394)

* feat: add 'summary' to be overridden from rules.yml

* chore: add simple and meaningful summary for kubernetes alerts
---
 _data/rules.yml   | 20 ++++++++++++++++++++
 dist/template.yml |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index ed73ece..2403b71 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1776,16 +1776,19 @@ groups:
             doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs
             rules:
               - name: Kubernetes Node not ready
+                summary: Kubernetes Node ready (node {{ $labels.node }})
                 description: Node {{ $labels.node }} has been unready for a long time
                 query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
                 severity: critical
                 for: 10m
               - name: Kubernetes Node memory pressure
+                summary: Kubernetes memory pressure (node {{ $labels.node }})
                 description: "Node {{ $labels.node }} has MemoryPressure condition"
                 query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
                 severity: critical
                 for: 2m
               - name: Kubernetes Node disk pressure
+                summary: Kubernetes disk pressure (node {{ $labels.node }})
                 description: "Node {{ $labels.node }} has DiskPressure condition"
                 query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
                 severity: critical
@@ -1801,18 +1804,22 @@ groups:
                 severity: warning
                 for: 2m
               - name: Kubernetes Container oom killer
+                summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
                 description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
                 query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
                 severity: warning
               - name: Kubernetes Job failed
+                summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
                 description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
                 query: 'kube_job_status_failed > 0'
                 severity: warning
               - name: Kubernetes CronJob suspended
+                summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
                 description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"
                 query: 'kube_cronjob_spec_suspend != 0'
                 severity: warning
               - name: Kubernetes PersistentVolumeClaim pending
+                summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
                 description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending"
                 query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
                 severity: warning
@@ -1827,10 +1834,12 @@ groups:
                 query: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
                 severity: critical
               - name: Kubernetes PersistentVolume error
+                summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
                 description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
                 query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
                 severity: critical
               - name: Kubernetes StatefulSet down
+                summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
                 query: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
                 severity: critical
@@ -1854,21 +1863,25 @@ groups:
                 query: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'  # allow minimum 3 replicas running
                 severity: info
               - name: Kubernetes Pod not healthy
+                summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
                 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.
                 query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
                 severity: critical
                 for: 15m
               - name: Kubernetes pod crash looping
+                summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
                 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
                 query: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
                 severity: warning
                 for: 2m
               - name: Kubernetes ReplicaSet replicas mismatch
+                summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
                 description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
                 query: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment replicas mismatch
+                summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
                 description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
                 query: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
                 severity: warning
@@ -1879,37 +1892,44 @@ groups:
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment generation mismatch
+                summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
                 description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
                 query: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet generation mismatch
+                summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
                 query: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet update not rolled out
+                summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
                 query: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet rollout stuck
+                summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
                 description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
                 query: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet misscheduled
+                summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
                 description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
                 query: 'kube_daemonset_status_number_misscheduled > 0'
                 severity: critical
                 for: 1m
               - name: Kubernetes CronJob too long
+                summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
                 description: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
                 query: 'time() - kube_cronjob_next_schedule_time > 3600'
                 severity: warning
                 comments: |
                   Threshold should be customized for each cronjob name.
               - name: Kubernetes Job slow completion
+                summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
                 description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
                 query: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
                 severity: critical
diff --git a/dist/template.yml b/dist/template.yml
index 0dd9684..cdde4ea 100644
--- a/dist/template.yml
+++ b/dist/template.yml
@@ -11,6 +11,6 @@ groups:
       labels:
         severity: {{ rule.severity }}
       annotations:
-        summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
+        summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
         description: "{{ rule.description | replace: '"', '\"' }}\n  VALUE = {% raw %}{{ $value }}{% endraw %}\n  LABELS = {% raw %}{{ $labels }}{% endraw %}"
 {% endfor %}
\ No newline at end of file

From 6ee065c636c19bb7c3d2e5cfb78a6a4657c81834 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Fri, 1 Dec 2023 17:26:16 +0000
Subject: [PATCH 020/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 40 ++++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 8ba0458..ba01753 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -10,7 +10,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node not ready (instance {{ $labels.instance }})
+        summary: Kubernetes Node not ready (node {{ $labels.node }})
         description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeMemoryPressure
@@ -19,7 +19,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
+        summary: Kubernetes memory pressure (node {{ $labels.node }})
         description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeDiskPressure
@@ -28,7 +28,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
+        summary: Kubernetes disk pressure (node {{ $labels.node }})
         description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeNetworkUnavailable
@@ -55,7 +55,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
+        summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
         description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobFailed
@@ -64,7 +64,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Job failed (instance {{ $labels.instance }})
+        summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
         description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobSuspended
@@ -73,7 +73,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPersistentvolumeclaimPending
@@ -82,7 +82,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
+        summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
         description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesVolumeOutOfDiskSpace
@@ -109,7 +109,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
+        summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
         description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetDown
@@ -118,7 +118,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaScaleInability
@@ -163,7 +163,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
+        summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
         description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodCrashLooping
@@ -172,7 +172,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+        summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
         description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesReplicasetReplicasMismatch
@@ -181,7 +181,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
         description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDeploymentReplicasMismatch
@@ -190,7 +190,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
         description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetReplicasMismatch
@@ -208,7 +208,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
         description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetGenerationMismatch
@@ -217,7 +217,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesStatefulsetUpdateNotRolledOut
@@ -226,7 +226,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetRolloutStuck
@@ -235,7 +235,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
+        summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
         description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesDaemonsetMisscheduled
@@ -244,7 +244,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
+        summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
         description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesCronjobTooLong
@@ -253,7 +253,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
         description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesJobSlowCompletion
@@ -262,7 +262,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
+        summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
         description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerErrors

From 31a27fb9e0e778bd8fe6097aa58c8ea598fe9cec Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 11 Dec 2023 23:49:28 +0100
Subject: [PATCH 021/123] fix screeb

---
 _layouts/default.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_layouts/default.html b/_layouts/default.html
index 8e73c74..1a41b93 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -147,7 +147,7 @@
         s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
         b=c.createElement('script');b.type='text/javascript';
         b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
-    }(window,document,'$screeb','https://t.screeb.app/tag.js'));
+    }(window,document,'$screeb','https://t2.screeb.app/tag.js'));
 
     $screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
   </script>

From c6ff5a59dca3bf16f173f3447c62941ef58bc08e Mon Sep 17 00:00:00 2001
From: josedev-union <70741025+josedev-union@users.noreply.github.com>
Date: Sat, 20 Jan 2024 20:33:26 +0100
Subject: [PATCH 022/123] feat: Add rules for Graph Node (#387)

Co-authored-by: josedev-union <josedev-union@users.noreply.github.com>
---
 README.md                                   |  1 +
 _data/rules.yml                             | 30 +++++++++++
 dist/rules/graph-node/embedded-exporter.yml | 59 +++++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 dist/rules/graph-node/embedded-exporter.yml

diff --git a/README.md b/README.md
index a4d2053..33f5d38 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
 - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
 - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
+- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
 
 ## 🤝 Contributing
 
diff --git a/_data/rules.yml b/_data/rules.yml
index 2403b71..8d72345 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2819,3 +2819,33 @@ groups:
                 description: UPS load is > 80%
                 query: 'apcupsd_ups_load_percent > 80'
                 severity: warning
+
+      - name: Graph Node
+        exporters:
+          - name: Embedded exporter
+            slug: embedded-exporter
+            rules:
+              - name: Provider failed because net_version failed
+                description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+                query: 'eth_rpc_status == 1'
+                severity: critical
+              - name: Provider failed because get genesis failed
+                description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+                query: 'eth_rpc_status == 2'
+                severity: critical
+              - name: Provider failed because net_version timeout
+                description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+                query: 'eth_rpc_status == 3'
+                severity: critical
+              - name: Provider failed because get genesis timeout
+                description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+                query: 'eth_rpc_status == 4'
+                severity: critical
+              - name: Store connection is slow
+                description: "Store connection is slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
+                query: 'store_connection_wait_time_ms > 10'
+                severity: warning
+              - name: Store connection is too slow
+                description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
+                query: 'store_connection_wait_time_ms > 20'
+                severity: critical
diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml
new file mode 100644
index 0000000..ec555b6
--- /dev/null
+++ b/dist/rules/graph-node/embedded-exporter.yml
@@ -0,0 +1,59 @@
+groups:
+
+- name: EmbeddedExporter
+
+  rules:
+
+    - alert: ProviderNetVersionFailed
+      expr: 'eth_rpc_status == 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Provider net_version failed (provider {{$labels.provider}}, node {{$labels.instance}})
+        description: "Failed net_version for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+
+    - alert: ProviderGetGenesisFailed
+      expr: 'eth_rpc_status == 2'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Provider get genesis failed (provider {{$labels.provider}}, node {{$labels.instance}})
+        description: "Failed to get genesis for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+
+    - alert: ProviderNetVersionTimeout
+      expr: 'eth_rpc_status == 3'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Provider net_version timeout (provider {{$labels.provider}}, node {{$labels.instance}})
+        description: "net_version timeout for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+
+    - alert: ProviderGetGenesisTimeout
+      expr: 'eth_rpc_status == 4'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Provider get genesis timeout (provider {{$labels.provider}}, node {{$labels.instance}})
+        description: "Timeout to get genesis for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+
+    - alert: StoreConnectionSlow
+      expr: 'store_connection_wait_time_ms > 10'
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: Store connection is slow (pool {{$labels.pool}}, shard {{$labels.shard}}, node {{$labels.instance}})
+        description: "Store connection is slow to {{$labels.pool}} pool, {{$labels.shard}} shard in Graph node {{$labels.instance}}"
+
+    - alert: StoreConnectionTooSlow
+      expr: 'store_connection_wait_time_ms > 20'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Store connection is too slow (pool {{$labels.pool}}, shard {{$labels.shard}}, node {{$labels.instance}})
+        description: "Store connection is too slow to {{$labels.pool}} pool, {{$labels.shard}} shard in Graph node {{$labels.instance}}"

From df4016bf6afe6822172bed9c7939dfe4f3fd76fe Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sat, 20 Jan 2024 19:34:37 +0000
Subject: [PATCH 023/123] Publish

---
 dist/rules/graph-node/embedded-exporter.yml | 40 ++++++++++-----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml
index ec555b6..a8d0768 100644
--- a/dist/rules/graph-node/embedded-exporter.yml
+++ b/dist/rules/graph-node/embedded-exporter.yml
@@ -4,56 +4,56 @@ groups:
 
   rules:
 
-    - alert: ProviderNetVersionFailed
+    - alert: ProviderFailedBecauseNet_versionFailed
       expr: 'eth_rpc_status == 1'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Provider net_version failed (provider {{$labels.provider}}, node {{$labels.instance}})
-        description: "Failed net_version for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+        summary: Provider failed because net_version failed (instance {{ $labels.instance }})
+        description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ProviderGetGenesisFailed
+    - alert: ProviderFailedBecauseGetGenesisFailed
       expr: 'eth_rpc_status == 2'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Provider get genesis failed (provider {{$labels.provider}}, node {{$labels.instance}})
-        description: "Failed to get genesis for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+        summary: Provider failed because get genesis failed (instance {{ $labels.instance }})
+        description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ProviderNetVersionTimeout
+    - alert: ProviderFailedBecauseNet_versionTimeout
       expr: 'eth_rpc_status == 3'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Provider net_version timeout (provider {{$labels.provider}}, node {{$labels.instance}})
-        description: "net_version timeout for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+        summary: Provider failed because net_version timeout (instance {{ $labels.instance }})
+        description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ProviderGetGenesisTimeout
+    - alert: ProviderFailedBecauseGetGenesisTimeout
       expr: 'eth_rpc_status == 4'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Provider get genesis timeout (provider {{$labels.provider}}, node {{$labels.instance}})
-        description: "Timeout to get genesis for Provider {{$labels.provider}} in Graph node {{$labels.instance}}"
+        summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
+        description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: StoreConnectionSlow
+    - alert: StoreConnectionIsSlow
       expr: 'store_connection_wait_time_ms > 10'
-      for: 1m
+      for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Store connection is slow (pool {{$labels.pool}}, shard {{$labels.shard}}, node {{$labels.instance}})
-        description: "Store connection is slow to {{$labels.pool}} pool, {{$labels.shard}} shard in Graph node {{$labels.instance}}"
+        summary: Store connection is slow (instance {{ $labels.instance }})
+        description: "Store connection is slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: StoreConnectionTooSlow
+    - alert: StoreConnectionIsTooSlow
       expr: 'store_connection_wait_time_ms > 20'
-      for: 1m
+      for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Store connection is too slow (pool {{$labels.pool}}, shard {{$labels.shard}}, node {{$labels.instance}})
-        description: "Store connection is too slow to {{$labels.pool}} pool, {{$labels.shard}} shard in Graph node {{$labels.instance}}"
+        summary: Store connection is too slow (instance {{ $labels.instance }})
+        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 56a7e0d03aebe4fedcca2c1e816d9e56b8545671 Mon Sep 17 00:00:00 2001
From: Brett Beutell <brbeut@gmail.com>
Date: Fri, 26 Jan 2024 04:09:35 +0100
Subject: [PATCH 024/123] Update rule for host memory underutilization to use
 avg_over_time instead of rate, since node_memory_MemAvailable_bytes is a
 gauge (#400)

---
 dist/rules/host-and-hardware/node-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index c63e0c8..fd27d81 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -23,7 +23,7 @@ groups:
         description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostMemoryIsUnderutilized
-      expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 1w
       labels:
         severity: info

From 0727f2ef2ecfb2f7013b893b07acac1811a869dc Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Fri, 26 Jan 2024 04:10:22 +0100
Subject: [PATCH 025/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 8d72345..7d9f46c 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -148,7 +148,7 @@ groups:
                 for: 2m
               - name: Host Memory is underutilized
                 description: 'Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})'
-                query: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: info
                 for: 1w
                 comments: |

From 4eb0e910e7d18df237004b1b5a71790771f6a981 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20=C4=8Cervenka?= <cervajs@freevoice.cz>
Date: Fri, 9 Feb 2024 20:23:30 +0100
Subject: [PATCH 026/123] SMART monitoring (#402)

* SMART monitoring

* query regex fix

---------

Co-authored-by: Marek Cervenka <cervenka@ipex.cz>
---
 _data/rules.yml | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 7d9f46c..ef84f1b 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -333,6 +333,38 @@ groups:
                 severity: info
                 for: 4h
 
+      - name: S.M.A.R.T Device Monitoring
+        exporters:
+          - name: smartctl-exporter
+            slug: smartctl-exporter
+            doc_url: https://github.com/prometheus-community/smartctl_exporter
+            rules:
+              - name: Smart device temperature warning
+                description: Device temperature  warning (instance {{ $labels.instance }})
+                query: smartctl_device_temperature > 60
+                severity: warning
+                for: 2m
+              - name: Smart device temperature critical
+                description: Device temperature critical  (instance {{ $labels.instance }})
+                query: smartctl_device_temperature > 80
+                severity: critical
+                for: 2m
+              - name: Smart critical warning
+                description: device has critical warning (instance {{ $labels.instance }})
+                query: smartctl_device_critical_warning > 0
+                severity: critical
+                for: 15m
+              - name: Smart media errors
+                description: device has media errors (instance {{ $labels.instance }})
+                query: smartctl_device_media_errors > 0
+                severity: critical
+                for: 15m
+              - name: Smart NVME Wearout Indicator
+                description: NVMe device is wearing out (instance {{ $labels.instance }})
+                query: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}
+                severity: critical
+                for: 15m
+ 
       - name: Docker containers
         exporters:
           - name: google/cAdvisor

From 854688d17aa069ce2298f88a19781d750f5072ac Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Fri, 9 Feb 2024 20:24:10 +0100
Subject: [PATCH 027/123] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 33f5d38..b500cb7 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 
 - [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
 - [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
+- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
 - [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
 - [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
 - [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)

From 0dba950ccc7935cfb4b89b0751e6256dddaacb87 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Fri, 9 Feb 2024 19:25:17 +0000
Subject: [PATCH 028/123] Publish

---
 .../smartctl-exporter.yml                     | 50 +++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml

diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
new file mode 100644
index 0000000..1946c38
--- /dev/null
+++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
@@ -0,0 +1,50 @@
+groups:
+
+- name: SmartctlExporter
+
+  rules:
+
+    - alert: SmartDeviceTemperatureWarning
+      expr: 'smartctl_device_temperature > 60'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: Smart device temperature warning (instance {{ $labels.instance }})
+        description: "Device temperature  warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartDeviceTemperatureCritical
+      expr: 'smartctl_device_temperature > 80'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart device temperature critical (instance {{ $labels.instance }})
+        description: "Device temperature critical  (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartCriticalWarning
+      expr: 'smartctl_device_critical_warning > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart critical warning (instance {{ $labels.instance }})
+        description: "device has critical warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartMediaErrors
+      expr: 'smartctl_device_media_errors > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart media errors (instance {{ $labels.instance }})
+        description: "device has media errors (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartNvmeWearoutIndicator
+      expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
+        description: "NVMe device is wearing out (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 2a2a1305bad90cd67ab78f051a914116e2561325 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 9 Feb 2024 20:33:50 +0100
Subject: [PATCH 029/123] build(deps-dev): bump nokogiri from 1.14.3 to 1.16.2
 (#401)

Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.14.3 to 1.16.2.
- [Release notes](https://github.com/sparklemotion/nokogiri/releases)
- [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md)
- [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.14.3...v1.16.2)

---
updated-dependencies:
- dependency-name: nokogiri
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 7b76ef9..df07b1f 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -231,7 +231,7 @@ GEM
       jekyll-seo-tag (~> 2.1)
     minitest (5.17.0)
     multipart-post (2.1.1)
-    nokogiri (1.14.3-x86_64-linux)
+    nokogiri (1.16.2-x86_64-linux)
       racc (~> 1.4)
     octokit (4.22.0)
       faraday (>= 0.9)
@@ -239,7 +239,7 @@ GEM
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
     public_suffix (4.0.7)
-    racc (1.6.2)
+    racc (1.7.3)
     rb-fsevent (0.11.1)
     rb-inotify (0.10.1)
       ffi (~> 1.0)

From 5f57f09db0a4fd48afefa87aaf9e723a0eb97579 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 10 Feb 2024 20:01:19 +0100
Subject: [PATCH 030/123] fix(HostOutOfInodes): exclude msdosfs FS

See #398
---
 _data/rules.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index ef84f1b..7db340f 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -193,16 +193,16 @@ groups:
                 for: 2m
               - name: Host out of inodes
                 description: Disk is almost running out of available inodes (< 10% left)
-                query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 2m
               - name: Host filesystem device error
-                description: '{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem'
-                query: 'node_filesystem_device_error == 1'
+                description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem"
+                query: "node_filesystem_device_error == 1"
                 severity: critical
               - name: Host inodes will fill in 24 hours
                 description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
-                query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 2m
               - name: Host unusual disk read latency

From 284db65e46fcf54548ec585355b0af269c8bfaf1 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sat, 10 Feb 2024 19:02:28 +0000
Subject: [PATCH 031/123] Publish

---
 dist/rules/host-and-hardware/node-exporter.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index fd27d81..de48231 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -86,7 +86,7 @@ groups:
         description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOutOfInodes
-      expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 2m
       labels:
         severity: warning
@@ -104,7 +104,7 @@ groups:
         description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostInodesWillFillIn24Hours
-      expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 2m
       labels:
         severity: warning

From 937cd35df72b2e4b6d774cb3cdc06ee3de37cf70 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 10 Feb 2024 20:03:13 +0100
Subject: [PATCH 032/123] :lipstick:

---
 _data/rules.yml | 1233 +++++++++++++++++++++++------------------------
 1 file changed, 612 insertions(+), 621 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 7db340f..10833f4 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1,4 +1,3 @@
-
 #
 # The following yaml cannot be copy-pasted to Prometheus configuration.
 #     Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
@@ -14,121 +13,121 @@ groups:
         exporters:
           - slug: embedded-exporter
             rules:
-            - name: Prometheus job missing
-              description: A Prometheus job has disappeared
-              query: 'absent(up{job="prometheus"})'
-              severity: warning
-            - name: Prometheus target missing
-              description: A Prometheus target has disappeared. An exporter might be crashed.
-              query: 'up == 0'
-              severity: critical
-            - name: Prometheus all targets missing
-              description: A Prometheus job does not have living target anymore.
-              query: 'sum by (job) (up) == 0'
-              severity: critical
-            - name: Prometheus target missing with warmup time
-              description: Allow a job time to start up (10 minutes) before alerting that it's down.
-              query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
-              severity: critical
-            - name: Prometheus configuration reload failure
-              description: Prometheus configuration reload error
-              query: 'prometheus_config_last_reload_successful != 1'
-              severity: warning
-            - name: Prometheus too many restarts
-              description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
-              query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
-              severity: warning
-            - name: Prometheus AlertManager job missing
-              description: A Prometheus AlertManager job has disappeared
-              query: 'absent(up{job="alertmanager"})'
-              severity: warning
-            - name: Prometheus AlertManager configuration reload failure
-              description: AlertManager configuration reload error
-              query: 'alertmanager_config_last_reload_successful != 1'
-              severity: warning
-            - name: Prometheus AlertManager config not synced
-              description: Configurations of AlertManager cluster instances are out of sync
-              query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
-              severity: warning
-            - name: Prometheus AlertManager E2E dead man switch
-              description: Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
-              query: 'vector(1)'
-              severity: critical
-            - name: Prometheus not connected to alertmanager
-              description: Prometheus cannot connect the alertmanager
-              query: 'prometheus_notifications_alertmanagers_discovered < 1'
-              severity: critical
-            - name: Prometheus rule evaluation failures
-              description: 'Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.'
-              query: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
-              severity: critical
-            - name: Prometheus template text expansion failures
-              description: 'Prometheus encountered {{ $value }} template text expansion failures'
-              query: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
-              severity: critical
-            - name: Prometheus rule evaluation slow
-              description: 'Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.'
-              query: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
-              severity: warning
-              for: 5m
-            - name: Prometheus notifications backlog
-              description: The Prometheus notification queue has not been empty for 10 minutes
-              query: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
-              severity: warning
-            - name: Prometheus AlertManager notification failing
-              description: Alertmanager is failing sending notifications
-              query: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus target empty
-              description: Prometheus has no target in service discovery
-              query: 'prometheus_sd_discovered_targets == 0'
-              severity: critical
-            - name: Prometheus target scraping slow
-              description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
-              query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
-              severity: warning
-              for: 5m
-            - name: Prometheus large scrape
-              description: Prometheus has many scrapes that exceed the sample limit
-              query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
-              severity: warning
-              for: 5m
-            - name: Prometheus target scrape duplicate
-              description: Prometheus has many samples rejected due to duplicate timestamps but different values
-              query: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
-              severity: warning
-            - name: Prometheus TSDB checkpoint creation failures
-              description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
-              query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB checkpoint deletion failures
-              description: 'Prometheus encountered {{ $value }} checkpoint deletion failures'
-              query: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB compactions failed
-              description: 'Prometheus encountered {{ $value }} TSDB compactions failures'
-              query: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB head truncations failed
-              description: 'Prometheus encountered {{ $value }} TSDB head truncation failures'
-              query: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB reload failures
-              description: 'Prometheus encountered {{ $value }} TSDB reload failures'
-              query: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB WAL corruptions
-              description: 'Prometheus encountered {{ $value }} TSDB WAL corruptions'
-              query: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus TSDB WAL truncations failed
-              description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
-              query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
-              severity: critical
-            - name: Prometheus timeseries cardinality
-              description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
-              query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
-              severity: warning
+              - name: Prometheus job missing
+                description: A Prometheus job has disappeared
+                query: 'absent(up{job="prometheus"})'
+                severity: warning
+              - name: Prometheus target missing
+                description: A Prometheus target has disappeared. An exporter might be crashed.
+                query: "up == 0"
+                severity: critical
+              - name: Prometheus all targets missing
+                description: A Prometheus job does not have living target anymore.
+                query: "sum by (job) (up) == 0"
+                severity: critical
+              - name: Prometheus target missing with warmup time
+                description: Allow a job time to start up (10 minutes) before alerting that it's down.
+                query: "sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))"
+                severity: critical
+              - name: Prometheus configuration reload failure
+                description: Prometheus configuration reload error
+                query: "prometheus_config_last_reload_successful != 1"
+                severity: warning
+              - name: Prometheus too many restarts
+                description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
+                query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+                severity: warning
+              - name: Prometheus AlertManager job missing
+                description: A Prometheus AlertManager job has disappeared
+                query: 'absent(up{job="alertmanager"})'
+                severity: warning
+              - name: Prometheus AlertManager configuration reload failure
+                description: AlertManager configuration reload error
+                query: "alertmanager_config_last_reload_successful != 1"
+                severity: warning
+              - name: Prometheus AlertManager config not synced
+                description: Configurations of AlertManager cluster instances are out of sync
+                query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+                severity: warning
+              - name: Prometheus AlertManager E2E dead man switch
+                description: Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
+                query: "vector(1)"
+                severity: critical
+              - name: Prometheus not connected to alertmanager
+                description: Prometheus cannot connect the alertmanager
+                query: "prometheus_notifications_alertmanagers_discovered < 1"
+                severity: critical
+              - name: Prometheus rule evaluation failures
+                description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+                query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
+                severity: critical
+              - name: Prometheus template text expansion failures
+                description: "Prometheus encountered {{ $value }} template text expansion failures"
+                query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
+                severity: critical
+              - name: Prometheus rule evaluation slow
+                description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query."
+                query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
+                severity: warning
+                for: 5m
+              - name: Prometheus notifications backlog
+                description: The Prometheus notification queue has not been empty for 10 minutes
+                query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
+                severity: warning
+              - name: Prometheus AlertManager notification failing
+                description: Alertmanager is failing sending notifications
+                query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus target empty
+                description: Prometheus has no target in service discovery
+                query: "prometheus_sd_discovered_targets == 0"
+                severity: critical
+              - name: Prometheus target scraping slow
+                description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
+                query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
+                severity: warning
+                for: 5m
+              - name: Prometheus large scrape
+                description: Prometheus has many scrapes that exceed the sample limit
+                query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
+                severity: warning
+                for: 5m
+              - name: Prometheus target scrape duplicate
+                description: Prometheus has many samples rejected due to duplicate timestamps but different values
+                query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
+                severity: warning
+              - name: Prometheus TSDB checkpoint creation failures
+                description: "Prometheus encountered {{ $value }} checkpoint creation failures"
+                query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB checkpoint deletion failures
+                description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
+                query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB compactions failed
+                description: "Prometheus encountered {{ $value }} TSDB compactions failures"
+                query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB head truncations failed
+                description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
+                query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB reload failures
+                description: "Prometheus encountered {{ $value }} TSDB reload failures"
+                query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB WAL corruptions
+                description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
+                query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus TSDB WAL truncations failed
+                description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
+                query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
+                severity: critical
+              - name: Prometheus timeseries cardinality
+                description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
+                query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+                severity: warning
 
       - name: Host and hardware
         exporters:
@@ -147,7 +146,7 @@ groups:
                 severity: warning
                 for: 2m
               - name: Host Memory is underutilized
-                description: 'Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})'
+                description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
                 query: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: info
                 for: 1w
@@ -221,7 +220,7 @@ groups:
                 severity: warning
                 for: 10m
               - name: Host CPU is underutilized
-                description: 'CPU load is < 20% for 1 week. Consider reducing the number of CPUs.'
+                description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs."
                 query: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: info
                 for: 1w
@@ -236,7 +235,7 @@ groups:
                 query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
               - name: Host unusual disk IO
-                description: 'Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.'
+                description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues."
                 query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 5m
@@ -267,11 +266,11 @@ groups:
                 query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: critical
               - name: Host RAID array got inactive
-                description: 'RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.'
+                description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically."
                 query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: critical
               - name: Host RAID disk failure
-                description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap'
+                description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap"
                 query: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 2m
@@ -304,7 +303,7 @@ groups:
                 for: 2m
               - name: Host Network Interface Saturated
                 description: 'The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.'
-                query: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'    # < to 10Gb to prevent +inf when max speed is unknown
+                query: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' # < to 10Gb to prevent +inf when max speed is unknown
                 severity: warning
                 for: 1m
               - name: Host Network Bond Degraded
@@ -313,22 +312,22 @@ groups:
                 severity: warning
                 for: 2m
               - name: Host conntrack limit
-                description: 'The number of conntrack is approaching limit'
+                description: "The number of conntrack is approaching limit"
                 query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 5m
               - name: Host clock skew
-                description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
+                description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
                 query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 10m
               - name: Host clock not synchronising
-                description: 'Clock not synchronising. Ensure NTP is configured on this host.'
+                description: "Clock not synchronising. Ensure NTP is configured on this host."
                 query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 2m
               - name: Host requires reboot
-                description: '{{ $labels.instance }} requires a reboot.'
+                description: "{{ $labels.instance }} requires a reboot."
                 query: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: info
                 for: 4h
@@ -364,7 +363,7 @@ groups:
                 query: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}
                 severity: critical
                 for: 15m
- 
+
       - name: Docker containers
         exporters:
           - name: google/cAdvisor
@@ -373,13 +372,13 @@ groups:
             rules:
               - name: Container killed
                 description: A container has disappeared
-                query: 'time() - container_last_seen > 60'
+                query: "time() - container_last_seen > 60"
                 severity: warning
                 comments: |
                   This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
               - name: Container absent
                 description: A container is absent for 5 min
-                query: 'absent(container_last_seen)'
+                query: "absent(container_last_seen)"
                 severity: warning
                 for: 5m
                 comments: |
@@ -402,7 +401,7 @@ groups:
                 for: 2m
               - name: Container high throttle rate
                 description: Container is being throttled
-                query: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
+                query: "rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1"
                 severity: warning
                 for: 2m
               - name: Container Low CPU utilization
@@ -416,7 +415,6 @@ groups:
                 severity: info
                 for: 7d
 
-
       - name: Blackbox
         exporters:
           - name: prometheus/blackbox_exporter
@@ -429,28 +427,28 @@ groups:
                 severity: critical
               - name: Blackbox configuration reload failure
                 description: Blackbox configuration reload failure
-                query: 'blackbox_exporter_config_last_reload_successful != 1'
+                query: "blackbox_exporter_config_last_reload_successful != 1"
                 severity: warning
               - name: Blackbox slow probe
                 description: Blackbox probe took more than 1s to complete
-                query: 'avg_over_time(probe_duration_seconds[1m]) > 1'
+                query: "avg_over_time(probe_duration_seconds[1m]) > 1"
                 severity: warning
                 for: 1m
               - name: Blackbox probe HTTP failure
                 description: HTTP status code is not 200-399
-                query: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
+                query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
                 severity: critical
               - name: Blackbox SSL certificate will expire soon
                 description: SSL certificate expires in less than 20 days
-                query: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
+                query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
                 severity: warning
               - name: Blackbox SSL certificate will expire soon
                 description: SSL certificate expires in less than 3 days
-                query: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
+                query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
                 severity: critical
               - name: Blackbox SSL certificate expired
                 description: SSL certificate has expired already
-                query: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
+                query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
                 severity: critical
                 comments: |
                   For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
@@ -459,12 +457,12 @@ groups:
                   See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
               - name: Blackbox probe slow HTTP
                 description: HTTP request took more than 1s
-                query: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
+                query: "avg_over_time(probe_http_duration_seconds[1m]) > 1"
                 severity: warning
                 for: 1m
               - name: Blackbox probe slow ping
                 description: Blackbox ping took more than 1s
-                query: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
+                query: "avg_over_time(probe_icmp_duration_seconds[1m]) > 1"
                 severity: warning
                 for: 1m
 
@@ -476,7 +474,7 @@ groups:
             rules:
               - name: Windows Server collector Error
                 description: "Collector {{ $labels.collector }} was not successful"
-                query: 'windows_exporter_collector_success == 0'
+                query: "windows_exporter_collector_success == 0"
                 severity: critical
               - name: Windows Server service Status
                 description: Windows Service state is not OK
@@ -489,12 +487,12 @@ groups:
                 severity: warning
               - name: Windows Server memory Usage
                 description: Memory usage is more than 90%
-                query: '100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90'
+                query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
                 severity: warning
                 for: 2m
               - name: Windows Server disk Space Usage
                 description: Disk usage is more than 80%
-                query: '100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80'
+                query: "100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80"
                 severity: critical
                 for: 2m
 
@@ -506,22 +504,22 @@ groups:
             rules:
               - name: Virtual Machine Memory Warning
                 description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: 'vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90'
+                query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
                 severity: warning
                 for: 5m
               - name: Virtual Machine Memory Critical
                 description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: 'vmware_vm_mem_usage_average / 100 >= 90'
+                query: "vmware_vm_mem_usage_average / 100 >= 90"
                 severity: critical
                 for: 1m
               - name: High Number of Snapshots
                 description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
-                query: 'vmware_vm_snapshots > 3'
+                query: "vmware_vm_snapshots > 3"
                 severity: warning
                 for: 30m
               - name: Outdated Snapshots
                 description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
-                query: '(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3'
+                query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
                 severity: warning
                 for: 5m
 
@@ -557,23 +555,22 @@ groups:
                 severity: warning
               - name: Netdata MD mismatch cnt unsynchronized blocks
                 description: RAID Array have unsynchronized blocks
-                query: 'netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024'
+                query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
                 severity: warning
                 for: 2m
               - name: Netdata disk reallocated sectors
                 description: Reallocated sectors on disk
-                query: 'increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0'
+                query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
                 severity: info
               - name: Netdata disk current pending sector
                 description: Disk current pending sector
-                query: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
+                query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
                 severity: warning
               - name: Netdata reported uncorrectable disk sectors
                 description: Reported uncorrectable disk sectors
-                query: 'increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0'
+                query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
                 severity: warning
 
-
   - name: Databases and brokers
     services:
       - name: MySQL
@@ -584,29 +581,29 @@ groups:
             rules:
               - name: MySQL down
                 description: MySQL instance is down on {{ $labels.instance }}
-                query: 'mysql_up == 0'
+                query: "mysql_up == 0"
                 severity: critical
               - name: MySQL too many connections (> 80%)
-                description: 'More than 80% of MySQL connections are in use on {{ $labels.instance }}'
-                query: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80'
+                description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
+                query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80"
                 severity: warning
                 for: 2m
               - name: MySQL high threads running
-                description: 'More than 60% of MySQL connections are in running state on {{ $labels.instance }}'
-                query: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
+                description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
+                query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60"
                 severity: warning
                 for: 2m
               - name: MySQL Slave IO thread not running
-                description: 'MySQL Slave IO thread not running on {{ $labels.instance }}'
-                query: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
+                description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
+                query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
                 severity: critical
               - name: MySQL Slave SQL thread not running
-                description: 'MySQL Slave SQL thread not running on {{ $labels.instance }}'
-                query: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
+                description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
+                query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
                 severity: critical
               - name: MySQL Slave replication lag
-                description: 'MySQL replication lag on {{ $labels.instance }}'
-                query: '( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30'
+                description: "MySQL replication lag on {{ $labels.instance }}"
+                query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
                 severity: critical
                 for: 1m
               - name: MySQL slow queries
@@ -620,7 +617,7 @@ groups:
                 severity: warning
               - name: MySQL restarted
                 description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
-                query: 'mysql_global_status_uptime < 60'
+                query: "mysql_global_status_uptime < 60"
                 severity: info
 
       - name: PostgreSQL
@@ -631,27 +628,27 @@ groups:
             rules:
               - name: Postgresql down
                 description: Postgresql instance is down
-                query: 'pg_up == 0'
+                query: "pg_up == 0"
                 severity: critical
               - name: Postgresql restarted
                 description: Postgresql restarted
-                query: 'time() - pg_postmaster_start_time_seconds < 60'
+                query: "time() - pg_postmaster_start_time_seconds < 60"
                 severity: critical
               - name: Postgresql exporter error
                 description: Postgresql exporter is showing errors. A query may be buggy in query.yaml
-                query: 'pg_exporter_last_scrape_error > 0'
+                query: "pg_exporter_last_scrape_error > 0"
                 severity: critical
               - name: Postgresql table not auto vacuumed
                 description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
-                query: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
+                query: "(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
                 severity: warning
               - name: Postgresql table not auto analyzed
                 description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
-                query: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
+                query: "(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
                 severity: warning
               - name: Postgresql too many connections
                 description: PostgreSQL instance has too many connections (> 80%).
-                query: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
+                query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
                 severity: warning
                 for: 2m
               - name: Postgresql not enough connections
@@ -669,12 +666,12 @@ groups:
                 severity: warning
               - name: Postgresql commit rate low
                 description: Postgresql seems to be processing very few transactions
-                query: 'rate(pg_stat_database_xact_commit[1m]) < 10'
+                query: "rate(pg_stat_database_xact_commit[1m]) < 10"
                 severity: critical
                 for: 2m
               - name: Postgresql low XID consumption
                 description: Postgresql seems to be consuming transaction IDs very slowly
-                query: 'rate(pg_txid_current[1m]) < 5'
+                query: "rate(pg_txid_current[1m]) < 5"
                 severity: warning
                 for: 2m
               - name: Postgresql high rate statement timeout
@@ -687,12 +684,12 @@ groups:
                 severity: critical
               - name: Postgresql unused replication slot
                 description: Unused Replication Slots
-                query: 'pg_replication_slots_active == 0'
+                query: "pg_replication_slots_active == 0"
                 severity: warning
                 for: 1m
               - name: Postgresql too many dead tuples
                 description: PostgreSQL dead tuples is too large
-                query: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
+                query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
                 severity: warning
                 for: 2m
               - name: Postgresql configuration changed
@@ -701,29 +698,29 @@ groups:
                 severity: info
               - name: Postgresql SSL compression active
                 description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
-                query: 'sum(pg_stat_ssl_compression) > 0'
+                query: "sum(pg_stat_ssl_compression) > 0"
                 severity: critical
               - name: Postgresql too many locks acquired
                 description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
-                query: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
+                query: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
                 severity: critical
                 for: 2m
               - name: Postgresql bloat index high (> 80%)
-                description: 'The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`'
-                query: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
+                description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
+                query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
                 severity: warning
                 for: 1h
                 comments: |
                   See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
               - name: Postgresql bloat table high (> 80%)
-                description: 'The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`'
-                query: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
+                description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
+                query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
                 severity: warning
                 for: 1h
                 comments: |
                   See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
               - name: Postgresql invalid index
-                description: 'The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`'
+                description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
                 query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
                 severity: warning
                 for: 6h
@@ -762,7 +759,7 @@ groups:
             rules:
               - name: PGBouncer active connections
                 description: PGBouncer pools are filling up
-                query: 'pgbouncer_pools_server_active_connections > 200'
+                query: "pgbouncer_pools_server_active_connections > 200"
                 severity: warning
                 for: 2m
               - name: PGBouncer errors
@@ -782,7 +779,7 @@ groups:
             rules:
               - name: Redis down
                 description: Redis instance is down
-                query: 'redis_up == 0'
+                query: "redis_up == 0"
                 severity: critical
               - name: Redis missing master
                 description: Redis cluster has no node marked as master.
@@ -794,46 +791,46 @@ groups:
                 severity: critical
               - name: Redis disconnected slaves
                 description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
-                query: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0'
+                query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
                 severity: critical
               - name: Redis replication broken
                 description: Redis instance lost a slave
-                query: 'delta(redis_connected_slaves[1m]) < 0'
+                query: "delta(redis_connected_slaves[1m]) < 0"
                 severity: critical
               - name: Redis cluster flapping
                 description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
-                query: 'changes(redis_connected_slaves[1m]) > 1'
+                query: "changes(redis_connected_slaves[1m]) > 1"
                 severity: critical
                 for: 2m
               - name: Redis missing backup
                 description: Redis has not been backuped for 24 hours
-                query: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24'
+                query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24"
                 severity: critical
               - name: Redis out of system memory
                 description: Redis is running out of system memory (> 90%)
-                query: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90'
+                query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90"
                 severity: warning
                 for: 2m
                 comments: |
                   The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
               - name: Redis out of configured maxmemory
                 description: Redis is running out of configured maxmemory (> 90%)
-                query: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
+                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90"
                 severity: warning
                 for: 2m
               - name: Redis too many connections
                 description: Redis is running out of connections (> 90% used)
-                query: 'redis_connected_clients / redis_config_maxclients * 100 > 90'
+                query: "redis_connected_clients / redis_config_maxclients * 100 > 90"
                 severity: warning
                 for: 2m
               - name: Redis not enough connections
                 description: Redis instance should have more connections (> 5)
-                query: 'redis_connected_clients < 5'
+                query: "redis_connected_clients < 5"
                 severity: warning
                 for: 2m
               - name: Redis rejected connections
                 description: Some connections to Redis has been rejected
-                query: 'increase(redis_rejected_connections_total[1m]) > 0'
+                query: "increase(redis_rejected_connections_total[1m]) > 0"
                 severity: critical
 
       - name: MongoDB
@@ -844,11 +841,11 @@ groups:
             rules:
               - name: MongoDB Down
                 description: MongoDB instance is down
-                query: 'mongodb_up == 0'
+                query: "mongodb_up == 0"
                 severity: critical
               - name: Mongodb replica member unhealthy
                 description: MongoDB replica member is not healthy
-                query: 'mongodb_rs_members_health == 0'
+                query: "mongodb_rs_members_health == 0"
                 severity: critical
               - name: MongoDB replication lag
                 description: Mongodb replication lag is more than 10s
@@ -865,7 +862,7 @@ groups:
                 for: 2m
               - name: MongoDB cursors timeouts
                 description: Too many cursors are timing out
-                query: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
+                query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
                 severity: warning
                 for: 2m
               - name: MongoDB too many connections
@@ -875,7 +872,7 @@ groups:
                 for: 2m
               - name: MongoDB virtual memory usage
                 description: High memory usage
-                query: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
+                query: "(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3"
                 severity: warning
                 for: 2m
 
@@ -889,23 +886,23 @@ groups:
                 severity: critical
               - name: MongoDB replication Status 3
                 description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
-                query: 'mongodb_replset_member_state == 3'
+                query: "mongodb_replset_member_state == 3"
                 severity: critical
               - name: MongoDB replication Status 6
                 description: MongoDB Replication set member as seen from another member of the set, is not yet known
-                query: 'mongodb_replset_member_state == 6'
+                query: "mongodb_replset_member_state == 6"
                 severity: critical
               - name: MongoDB replication Status 8
                 description: MongoDB Replication set member as seen from another member of the set, is unreachable
-                query: 'mongodb_replset_member_state == 8'
+                query: "mongodb_replset_member_state == 8"
                 severity: critical
               - name: MongoDB replication Status 9
                 description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
-                query: 'mongodb_replset_member_state == 9'
+                query: "mongodb_replset_member_state == 9"
                 severity: critical
               - name: MongoDB replication Status 10
                 description: MongoDB Replication set member was once in a replica set but was subsequently removed
-                query: 'mongodb_replset_member_state == 10'
+                query: "mongodb_replset_member_state == 10"
                 severity: critical
               - name: MongoDB number cursors open
                 description: Too many cursors opened by MongoDB for clients (> 10k)
@@ -914,7 +911,7 @@ groups:
                 for: 2m
               - name: MongoDB cursors timeouts
                 description: Too many cursors are timing out
-                query: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
+                query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
                 severity: warning
                 for: 2m
               - name: MongoDB too many connections
@@ -945,45 +942,45 @@ groups:
             rules:
               - name: RabbitMQ node down
                 description: Less than 3 nodes running in RabbitMQ cluster
-                query: 'sum(rabbitmq_build_info) < 3'
+                query: "sum(rabbitmq_build_info) < 3"
                 severity: critical
               - name: RabbitMQ node not distributed
                 description: Distribution link state is not 'up'
-                query: 'erlang_vm_dist_node_state < 3'
+                query: "erlang_vm_dist_node_state < 3"
                 severity: critical
               - name: RabbitMQ instances different versions
                 description: Running different version of RabbitMQ in the same cluster, can lead to failure.
-                query: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
+                query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
                 severity: warning
                 for: 1h
               - name: RabbitMQ memory high
                 description: A node use more than 90% of allocated RAM
-                query: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
+                query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90"
                 severity: warning
                 for: 2m
               - name: RabbitMQ file descriptors usage
                 description: A node use more than 90% of file descriptors
-                query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
+                query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90"
                 severity: warning
                 for: 2m
               - name: RabbitMQ too many unack messages
                 description: Too many unacknowledged messages
-                query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
+                query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
                 severity: warning
                 for: 1m
               - name: RabbitMQ too many connections
                 description: The total connections of a node is too high
-                query: 'rabbitmq_connections > 1000'
+                query: "rabbitmq_connections > 1000"
                 severity: warning
                 for: 2m
               - name: RabbitMQ no queue consumer
                 description: A queue has less than 1 consumer
-                query: 'rabbitmq_queue_consumers < 1'
+                query: "rabbitmq_queue_consumers < 1"
                 severity: warning
-                for: 1m   # allows a short service restart
+                for: 1m # allows a short service restart
               - name: RabbitMQ unroutable messages
                 description: A queue has unroutable messages
-                query: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
+                query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
                 severity: warning
                 for: 2m
 
@@ -993,24 +990,24 @@ groups:
             rules:
               - name: RabbitMQ down
                 description: RabbitMQ node down
-                query: 'rabbitmq_up == 0'
+                query: "rabbitmq_up == 0"
                 severity: critical
               - name: RabbitMQ cluster down
                 description: Less than 3 nodes running in RabbitMQ cluster
-                query: 'sum(rabbitmq_running) < 3'
+                query: "sum(rabbitmq_running) < 3"
                 severity: critical
               - name: RabbitMQ cluster partition
                 description: Cluster partition
-                query: 'rabbitmq_partitions > 0'
+                query: "rabbitmq_partitions > 0"
                 severity: critical
               - name: RabbitMQ out of memory
                 description: Memory available for RabbmitMQ is low (< 10%)
-                query: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
+                query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90"
                 severity: warning
                 for: 2m
               - name: RabbitMQ too many connections
                 description: RabbitMQ instance has too many connections (> 1000)
-                query: 'rabbitmq_connectionsTotal > 1000'
+                query: "rabbitmq_connectionsTotal > 1000"
                 severity: warning
                 for: 2m
               - name: RabbitMQ dead letter queue filling up
@@ -1036,9 +1033,9 @@ groups:
                   Indicate the queue name in dedicated label.
               - name: RabbitMQ no consumer
                 description: Queue has no consumer
-                query: 'rabbitmq_queue_consumers == 0'
+                query: "rabbitmq_queue_consumers == 0"
                 severity: critical
-                for: 1m    # allows a short service restart
+                for: 1m # allows a short service restart
               - name: RabbitMQ too many consumers
                 description: Queue should have only 1 consumer
                 query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
@@ -1071,11 +1068,11 @@ groups:
                 for: 2m
               - name: Elasticsearch disk out of space
                 description: The disk usage is over 90%
-                query: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10'
+                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10"
                 severity: critical
               - name: Elasticsearch disk space low
                 description: The disk usage is over 80%
-                query: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20'
+                query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20"
                 severity: warning
                 for: 2m
               - name: Elasticsearch Cluster Red
@@ -1088,37 +1085,37 @@ groups:
                 severity: warning
               - name: Elasticsearch Healthy Nodes
                 description: "Missing node in Elasticsearch cluster"
-                query: 'elasticsearch_cluster_health_number_of_nodes < 3'
+                query: "elasticsearch_cluster_health_number_of_nodes < 3"
                 severity: critical
               - name: Elasticsearch Healthy Data Nodes
                 description: "Missing data node in Elasticsearch cluster"
-                query: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
+                query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
                 severity: critical
               - name: Elasticsearch relocating shards
                 description: "Elasticsearch is relocating shards"
-                query: 'elasticsearch_cluster_health_relocating_shards > 0'
+                query: "elasticsearch_cluster_health_relocating_shards > 0"
                 severity: info
               - name: Elasticsearch relocating shards too long
                 description: "Elasticsearch has been relocating shards for 15min"
-                query: 'elasticsearch_cluster_health_relocating_shards > 0'
+                query: "elasticsearch_cluster_health_relocating_shards > 0"
                 severity: warning
                 for: 15m
               - name: Elasticsearch initializing shards
                 description: "Elasticsearch is initializing shards"
-                query: 'elasticsearch_cluster_health_initializing_shards > 0'
+                query: "elasticsearch_cluster_health_initializing_shards > 0"
                 severity: info
               - name: Elasticsearch initializing shards too long
                 description: "Elasticsearch has been initializing shards for 15 min"
-                query: 'elasticsearch_cluster_health_initializing_shards > 0'
+                query: "elasticsearch_cluster_health_initializing_shards > 0"
                 severity: warning
                 for: 15m
               - name: Elasticsearch unassigned shards
-                description: 'Elasticsearch has unassigned shards'
-                query: 'elasticsearch_cluster_health_unassigned_shards > 0'
+                description: "Elasticsearch has unassigned shards"
+                query: "elasticsearch_cluster_health_unassigned_shards > 0"
                 severity: critical
               - name: Elasticsearch pending tasks
-                description: 'Elasticsearch has pending tasks. Cluster works slowly.'
-                query: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'
+                description: "Elasticsearch has pending tasks. Cluster works slowly."
+                query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
                 severity: warning
                 for: 15m
               - name: Elasticsearch no new documents
@@ -1132,60 +1129,60 @@ groups:
             slug: instaclustr-cassandra-exporter
             doc_url: https://github.com/instaclustr/cassandra-exporter
             rules:
-              - name: 'Cassandra Node is unavailable'
-                description: 'Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}'
-                query: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
+              - name: "Cassandra Node is unavailable"
+                description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
+                query: "sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1"
                 severity: critical
-              - name: 'Cassandra many compaction tasks are pending'
-                description: 'Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}'
-                query: 'cassandra_table_estimated_pending_compactions > 100'
+              - name: "Cassandra many compaction tasks are pending"
+                description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
+                query: "cassandra_table_estimated_pending_compactions > 100"
                 severity: warning
-              - name: 'Cassandra commitlog pending tasks'
-                description: 'Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}'
-                query: 'cassandra_commit_log_pending_tasks > 15'
+              - name: "Cassandra commitlog pending tasks"
+                description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
+                query: "cassandra_commit_log_pending_tasks > 15"
                 for: 2m
                 severity: warning
-              - name: 'Cassandra compaction executor blocked tasks'
-                description: 'Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra compaction executor blocked tasks"
+                description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
                 query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
                 for: 2m
                 severity: warning
-              - name: 'Cassandra flush writer blocked tasks'
-                description: 'Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra flush writer blocked tasks"
+                description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
                 query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
                 for: 2m
                 severity: warning
-              - name: 'Cassandra connection timeouts total'
-                description: 'Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}'
-                query: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
+              - name: "Cassandra connection timeouts total"
+                description: "Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
+                query: "avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5"
                 for: 2m
                 severity: critical
-              - name: 'Cassandra storage exceptions'
-                description: 'Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}'
-                query: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
+              - name: "Cassandra storage exceptions"
+                description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
+                query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
                 severity: critical
-              - name: 'Cassandra tombstone dump'
-                description: 'Cassandra tombstone dump - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra tombstone dump"
+                description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
                 query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
                 for: 2m
                 severity: critical
-              - name: 'Cassandra client request unavailable write'
-                description: 'Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra client request unavailable write"
+                description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
                 query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
                 for: 2m
                 severity: critical
-              - name: 'Cassandra client request unavailable read'
-                description: 'Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra client request unavailable read"
+                description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
                 query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
                 for: 2m
                 severity: critical
-              - name: 'Cassandra client request write failure'
-                description: 'Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra client request write failure"
+                description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                 query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
                 for: 2m
                 severity: critical
-              - name: 'Cassandra client request read failure'
-                description: 'Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}'
+              - name: "Cassandra client request read failure"
+                description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
                 query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
                 for: 2m
                 severity: critical
@@ -1283,25 +1280,25 @@ groups:
             slug: cloudflare-kafka-zookeeper-exporter
             doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
             rules:
-          - name:  dabealu/zookeeper-exporter
+          - name: dabealu/zookeeper-exporter
             slug: dabealu-zookeeper-exporter
             doc_url: https://github.com/dabealu/zookeeper-exporter
             rules:
               - name: Zookeeper Down
                 description: "Zookeeper down on instance {{ $labels.instance }}"
-                query: 'zk_up == 0'
+                query: "zk_up == 0"
                 severity: critical
               - name: Zookeeper missing leader
-                description:  "Zookeeper cluster has no node marked as leader"
-                query: 'sum(zk_server_leader) == 0'
+                description: "Zookeeper cluster has no node marked as leader"
+                query: "sum(zk_server_leader) == 0"
                 severity: critical
               - name: Zookeeper Too Many Leaders
                 description: "Zookeeper cluster has too many nodes marked as leader"
-                query: 'sum(zk_server_leader) > 1'
+                query: "sum(zk_server_leader) > 1"
                 severity: critical
               - name: Zookeeper Not Ok
                 description: "Zookeeper instance is not ok"
-                query: 'zk_ruok == 0'
+                query: "zk_ruok == 0"
                 severity: warning
                 for: 3m
 
@@ -1313,11 +1310,11 @@ groups:
             rules:
               - name: Kafka topics replicas
                 description: Kafka topic in-sync partition
-                query: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
+                query: "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3"
                 severity: critical
               - name: Kafka consumers group
                 description: Kafka consumers group
-                query: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
+                query: "sum(kafka_consumergroup_lag) by (consumergroup) > 50"
                 severity: critical
                 for: 1m
           - name: linkedin/Burrow
@@ -1326,11 +1323,11 @@ groups:
             rules:
               - name: Kafka topic offset decreased
                 description: Kafka topic offset has decreased
-                query: 'delta(kafka_burrow_partition_current_offset[1m]) < 0'
+                query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
                 severity: warning
               - name: Kafka consumer lag
                 description: Kafka consumer has a 30 minutes and increasing lag
-                query: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0'
+                query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
                 severity: warning
                 for: 15m
 
@@ -1399,22 +1396,22 @@ groups:
             rules:
               - name: Nats high connection count
                 description: High number of NATS connections ({{ $value }}) for {{ $labels.instance }}
-                query: 'gnatsd_varz_connections > 100'
+                query: "gnatsd_varz_connections > 100"
                 severity: warning
                 for: 3m
               - name: Nats high pending bytes
                 description: High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}
-                query: 'gnatsd_connz_pending_bytes > 100000'
+                query: "gnatsd_connz_pending_bytes > 100000"
                 severity: warning
                 for: 3m
               - name: Nats high subscriptions count
                 description: High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}
-                query: 'gnatsd_connz_subscriptions > 50'
+                query: "gnatsd_connz_subscriptions > 50"
                 severity: warning
                 for: 3m
               - name: Nats high routes count
                 description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
-                query: 'gnatsd_routez_num_routes > 10'
+                query: "gnatsd_routez_num_routes > 10"
                 severity: warning
                 for: 3m
 
@@ -1426,7 +1423,7 @@ groups:
             rules:
               - name: Solr update errors
                 description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
-                query: 'increase(solr_metrics_core_update_handler_errors_total[1m]) > 1'
+                query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
                 severity: critical
               - name: Solr query errors
                 description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
@@ -1439,7 +1436,7 @@ groups:
                 severity: critical
               - name: Solr low live node count
                 description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
-                query: 'solr_collections_live_nodes < 2'
+                query: "solr_collections_live_nodes < 2"
                 severity: critical
 
       - name: Hadoop
@@ -1518,7 +1515,6 @@ groups:
                 severity: warning
                 description: "HBase Write Requests are experiencing high latency."
 
-
   - name: Reverse proxies and load balancers
     services:
       - name: Nginx
@@ -1539,7 +1535,7 @@ groups:
                 for: 1m
               - name: Nginx latency high
                 description: Nginx p99 latency is higher than 3 seconds
-                query: 'histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3'
+                query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
                 severity: warning
                 for: 2m
 
@@ -1551,7 +1547,7 @@ groups:
             rules:
               - name: Apache down
                 description: Apache down
-                query: 'apache_up == 0'
+                query: "apache_up == 0"
                 severity: critical
               - name: Apache workers load
                 description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
@@ -1560,7 +1556,7 @@ groups:
                 for: 2m
               - name: Apache restart
                 description: Apache has just been restarted.
-                query: 'apache_uptime_seconds_total / 60 < 1'
+                query: "apache_uptime_seconds_total / 60 < 1"
                 severity: warning
 
       - name: HaProxy
@@ -1667,54 +1663,54 @@ groups:
                 for: 1m
               - name: HAProxy server response errors
                 description: Too many response errors to {{ $labels.server }} server (> 5%).
-                query: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
+                query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5"
                 severity: critical
                 for: 1m
               - name: HAProxy backend connection errors
                 description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
-                query: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
+                query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
                 severity: critical
                 for: 1m
               - name: HAProxy server connection errors
                 description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
-                query: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
+                query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
                 severity: critical
               - name: HAProxy backend max active session
                 description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
-                query: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
+                query: "((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
                 severity: warning
                 for: 2m
               - name: HAProxy pending requests
                 description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
-                query: 'sum by (backend) (haproxy_backend_current_queue) > 0'
+                query: "sum by (backend) (haproxy_backend_current_queue) > 0"
                 severity: warning
                 for: 2m
               - name: HAProxy HTTP slowing down
                 description: Average request time is increasing
-                query: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
+                query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
                 severity: warning
                 for: 1m
               - name: HAProxy retry high
                 description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
-                query: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
+                query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
                 severity: warning
                 for: 2m
               - name: HAProxy backend down
                 description: HAProxy backend is down
-                query: 'haproxy_backend_up == 0'
+                query: "haproxy_backend_up == 0"
                 severity: critical
               - name: HAProxy server down
                 description: HAProxy server is down
-                query: 'haproxy_server_up == 0'
+                query: "haproxy_server_up == 0"
                 severity: critical
               - name: HAProxy frontend security blocked requests
                 description: HAProxy is blocking requests for security reason
-                query: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
+                query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
                 severity: warning
                 for: 2m
               - name: HAProxy server healthcheck failure
                 description: Some server healthcheck are failing on {{ $labels.server }}
-                query: 'increase(haproxy_server_check_failures_total[1m]) > 0'
+                query: "increase(haproxy_server_check_failures_total[1m]) > 0"
                 severity: warning
                 for: 1m
 
@@ -1726,7 +1722,7 @@ groups:
             rules:
               - name: Traefik service down
                 description: All Traefik services are down
-                query: 'count(traefik_service_server_up) by (service) == 0'
+                query: "count(traefik_service_server_up) by (service) == 0"
                 severity: critical
               - name: Traefik high HTTP 4xx error rate service
                 description: Traefik service 4xx error rate is above 5%
@@ -1744,7 +1740,7 @@ groups:
             rules:
               - name: Traefik backend down
                 description: All Traefik backends are down
-                query: 'count(traefik_backend_server_up) by (backend) == 0'
+                query: "count(traefik_backend_server_up) by (backend) == 0"
                 severity: critical
               - name: Traefik high HTTP 4xx error rate backend
                 description: Traefik backend 4xx error rate is above 5%
@@ -1767,7 +1763,7 @@ groups:
             rules:
               - name: PHP-FPM max-children reached
                 description: PHP-FPM reached max children - {{ $labels.instance }}
-                query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
+                query: "sum(phpfpm_max_children_reached_total) by (instance) > 0"
                 severity: warning
 
       - name: JVM
@@ -1790,15 +1786,14 @@ groups:
             rules:
               - name: Sidekiq queue size
                 description: Sidekiq queue {{ $labels.name }} is growing
-                query: 'sidekiq_queue_size > 100'
+                query: "sidekiq_queue_size > 100"
                 severity: warning
                 for: 1m
               - name: Sidekiq scheduling latency too high
                 description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.
-                query: 'max(sidekiq_queue_latency) > 60'
+                query: "max(sidekiq_queue_latency) > 60"
                 severity: critical
 
-
   - name: Orchestrators
     services:
       - name: Kubernetes
@@ -1843,12 +1838,12 @@ groups:
               - name: Kubernetes Job failed
                 summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
                 description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
-                query: 'kube_job_status_failed > 0'
+                query: "kube_job_status_failed > 0"
                 severity: warning
               - name: Kubernetes CronJob suspended
                 summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
                 description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"
-                query: 'kube_cronjob_spec_suspend != 0'
+                query: "kube_cronjob_spec_suspend != 0"
                 severity: warning
               - name: Kubernetes PersistentVolumeClaim pending
                 summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
@@ -1858,12 +1853,12 @@ groups:
                 for: 2m
               - name: Kubernetes Volume out of disk space
                 description: Volume is almost full (< 10% left)
-                query: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
+                query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
                 severity: warning
                 for: 2m
               - name: Kubernetes Volume full in four days
                 description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
-                query: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
+                query: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
                 severity: critical
               - name: Kubernetes PersistentVolume error
                 summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
@@ -1873,7 +1868,7 @@ groups:
               - name: Kubernetes StatefulSet down
                 summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
-                query: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
+                query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
                 severity: critical
                 for: 1m
               - name: Kubernetes HPA scale inability
@@ -1887,12 +1882,12 @@ groups:
                 severity: warning
               - name: Kubernetes HPA scale maximum
                 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
-                query: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+                query: "kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas"
                 severity: info
                 for: 2m
               - name: Kubernetes HPA underutilized
                 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
-                query: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'  # allow minimum 3 replicas running
+                query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3" # allow minimum 3 replicas running
                 severity: info
               - name: Kubernetes Pod not healthy
                 summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
@@ -1903,67 +1898,67 @@ groups:
               - name: Kubernetes pod crash looping
                 summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
                 description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
-                query: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
+                query: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
                 severity: warning
                 for: 2m
               - name: Kubernetes ReplicaSet replicas mismatch
                 summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
                 description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
-                query: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
+                query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment replicas mismatch
                 summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
                 description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
-                query: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
+                query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
                 severity: warning
                 for: 10m
               - name: Kubernetes StatefulSet replicas mismatch
                 description: StatefulSet does not match the expected number of replicas.
-                query: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
+                query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
                 severity: warning
                 for: 10m
               - name: Kubernetes Deployment generation mismatch
                 summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
                 description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
-                query: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
+                query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet generation mismatch
                 summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
-                query: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
+                query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
                 severity: critical
                 for: 10m
               - name: Kubernetes StatefulSet update not rolled out
                 summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
-                query: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
+                query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)"
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet rollout stuck
                 summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
                 description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
-                query: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
+                query: "kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0"
                 severity: warning
                 for: 10m
               - name: Kubernetes DaemonSet misscheduled
                 summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
                 description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
-                query: 'kube_daemonset_status_number_misscheduled > 0'
+                query: "kube_daemonset_status_number_misscheduled > 0"
                 severity: critical
                 for: 1m
               - name: Kubernetes CronJob too long
                 summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
                 description: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
-                query: 'time() - kube_cronjob_next_schedule_time > 3600'
+                query: "time() - kube_cronjob_next_schedule_time > 3600"
                 severity: warning
                 comments: |
                   Threshold should be customized for each cronjob name.
               - name: Kubernetes Job slow completion
                 summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
                 description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
-                query: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
+                query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
                 severity: critical
                 for: 12h
               - name: Kubernetes API server errors
@@ -1985,12 +1980,11 @@ groups:
                 query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
                 severity: critical
               - name: Kubernetes API server latency
-                description: 'Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.'
+                description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
                 query: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
                 severity: warning
                 for: 2m
 
-
       - name: Nomad
         exporters:
           - name: Embedded exporter
@@ -1998,20 +1992,20 @@ groups:
             rules:
               - name: Nomad job failed
                 description: Nomad job failed
-                query: 'nomad_nomad_job_summary_failed > 0'
+                query: "nomad_nomad_job_summary_failed > 0"
                 severity: warning
               - name: Nomad job lost
                 description: Nomad job lost
-                query: 'nomad_nomad_job_summary_lost > 0'
+                query: "nomad_nomad_job_summary_lost > 0"
                 severity: warning
               - name: Nomad job queued
                 description: Nomad job queued
-                query: 'nomad_nomad_job_summary_queued > 0'
+                query: "nomad_nomad_job_summary_queued > 0"
                 severity: warning
                 for: 2m
               - name: Nomad blocked evaluation
                 description: Nomad blocked evaluation
-                query: 'nomad_nomad_blocked_evals_total_blocked > 0'
+                query: "nomad_nomad_blocked_evals_total_blocked > 0"
                 severity: warning
 
       - name: Consul
@@ -2022,12 +2016,12 @@ groups:
             rules:
               - name: Consul service healthcheck failed
                 description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`"
-                query: 'consul_catalog_service_node_healthy == 0'
+                query: "consul_catalog_service_node_healthy == 0"
                 severity: critical
-                for: 1m   # allows a short service restart
+                for: 1m # allows a short service restart
               - name: Consul missing master node
                 description: Numbers of consul raft peers should be 3, in order to preserve quorum.
-                query: 'consul_raft_peers < 3'
+                query: "consul_raft_peers < 3"
                 severity: critical
               - name: Consul agent unhealthy
                 description: A Consul agent is down
@@ -2041,15 +2035,15 @@ groups:
             rules:
               - name: Etcd insufficient Members
                 description: Etcd cluster should have an odd number of members
-                query: 'count(etcd_server_id) % 2 == 0'
+                query: "count(etcd_server_id) % 2 == 0"
                 severity: critical
               - name: Etcd no Leader
                 description: Etcd cluster have no leader
-                query: 'etcd_server_has_leader == 0'
+                query: "etcd_server_has_leader == 0"
                 severity: critical
               - name: Etcd high number of leader changes
                 description: Etcd leader changed more than 2 times during 10 minutes
-                query: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
+                query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
                 severity: warning
               - name: Etcd high number of failed GRPC requests
                 description: More than 1% GRPC request failure detected in Etcd
@@ -2068,37 +2062,37 @@ groups:
                 for: 2m
               - name: Etcd high number of failed HTTP requests
                 description: More than 1% HTTP failure detected in Etcd
-                query: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
+                query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
                 severity: warning
                 for: 2m
               - name: Etcd high number of failed HTTP requests
                 description: More than 5% HTTP failure detected in Etcd
-                query: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
+                query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
                 severity: critical
                 for: 2m
               - name: Etcd HTTP requests slow
                 description: HTTP requests slowing down, 99th percentile is over 0.15s
-                query: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
+                query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
                 severity: warning
                 for: 2m
               - name: Etcd member communication slow
                 description: Etcd member communication slowing down, 99th percentile is over 0.15s
-                query: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
+                query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15"
                 severity: warning
                 for: 2m
               - name: Etcd high number of failed proposals
                 description: Etcd server got more than 5 failed proposals past hour
-                query: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
+                query: "increase(etcd_server_proposals_failed_total[1h]) > 5"
                 severity: warning
                 for: 2m
               - name: Etcd high fsync durations
                 description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s
-                query: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
+                query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
                 severity: warning
                 for: 2m
               - name: Etcd high commit durations
                 description: Etcd commit duration increasing, 99th percentile is over 0.25s
-                query: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
+                query: "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25"
                 severity: warning
                 for: 2m
 
@@ -2110,7 +2104,7 @@ groups:
             rules:
               - name: Linkerd high error rate
                 description: Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%
-                query: 'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10'
+                query: "sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10"
                 severity: warning
                 for: 1m
 
@@ -2127,7 +2121,7 @@ groups:
                 for: 1m
               - name: Istio Pilot high total request rate
                 description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
-                query: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
+                query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
                 severity: warning
                 for: 1m
               - name: Istio Mixer Prometheus dispatches low
@@ -2162,12 +2156,12 @@ groups:
                 for: 1m
               - name: Istio latency 99 percentile
                 description: Istio 1% slowest requests are longer than 1000ms.
-                query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
+                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
                 severity: warning
                 for: 1m
               - name: Istio Pilot Duplicate Entry
                 description: Istio pilot duplicate entry error.
-                query: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
+                query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
                 severity: critical
 
       - name: ArgoCD
@@ -2187,7 +2181,6 @@ groups:
                 severity: warning
                 for: 15m
 
-
   - name: Network, security and storage
     services:
       - name: Ceph
@@ -2198,25 +2191,25 @@ groups:
             rules:
               - name: Ceph State
                 description: Ceph instance unhealthy
-                query: 'ceph_health_status != 0'
+                query: "ceph_health_status != 0"
                 severity: critical
               - name: Ceph monitor clock skew
                 description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
-                query: 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
+                query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
                 severity: warning
                 for: 2m
               - name: Ceph monitor low space
                 description: Ceph monitor storage is low.
-                query: 'ceph_monitor_avail_percent < 10'
+                query: "ceph_monitor_avail_percent < 10"
                 severity: warning
                 for: 2m
               - name: Ceph OSD Down
                 description: Ceph Object Storage Daemon Down
-                query: 'ceph_osd_up == 0'
+                query: "ceph_osd_up == 0"
                 severity: critical
               - name: Ceph high OSD latency
                 description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state."
-                query: 'ceph_osd_perf_apply_latency_seconds > 5'
+                query: "ceph_osd_perf_apply_latency_seconds > 5"
                 severity: warning
                 for: 1m
               - name: Ceph OSD low space
@@ -2226,16 +2219,16 @@ groups:
                 for: 2m
               - name: Ceph OSD reweighted
                 description: Ceph Object Storage Daemon takes too much time to resize.
-                query: 'ceph_osd_weight < 1'
+                query: "ceph_osd_weight < 1"
                 severity: warning
                 for: 2m
               - name: Ceph PG down
                 description: Some Ceph placement groups are down. Please ensure that all the data are available.
-                query: 'ceph_pg_down > 0'
+                query: "ceph_pg_down > 0"
                 severity: critical
               - name: Ceph PG incomplete
                 description: Some Ceph placement groups are incomplete. Please ensure that all the data are available.
-                query: 'ceph_pg_incomplete > 0'
+                query: "ceph_pg_incomplete > 0"
                 severity: critical
               - name: Ceph PG inconsistent
                 description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
@@ -2243,17 +2236,17 @@ groups:
                 severity: warning
               - name: Ceph PG activation long
                 description: Some Ceph placement groups are too long to activate.
-                query: 'ceph_pg_activating > 0'
+                query: "ceph_pg_activating > 0"
                 severity: warning
                 for: 2m
               - name: Ceph PG backfill full
                 description: Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
-                query: 'ceph_pg_backfill_toofull > 0'
+                query: "ceph_pg_backfill_toofull > 0"
                 severity: warning
                 for: 2m
               - name: Ceph PG unavailable
                 description: Some Ceph placement groups are unavailable.
-                query: 'ceph_pg_total - ceph_pg_active > 0'
+                query: "ceph_pg_total - ceph_pg_active > 0"
                 severity: critical
 
       - name: SpeedTest
@@ -2264,11 +2257,11 @@ groups:
             rules:
               - name: SpeedTest Slow Internet Download
                 description: Internet download speed is currently {{humanize $value}} Mbps.
-                query: 'avg_over_time(speedtest_download[10m]) < 100'
+                query: "avg_over_time(speedtest_download[10m]) < 100"
                 severity: warning
               - name: SpeedTest Slow Internet Upload
                 description: Internet upload speed is currently {{humanize $value}} Mbps.
-                query: 'avg_over_time(speedtest_upload[10m]) < 20'
+                query: "avg_over_time(speedtest_upload[10m]) < 20"
                 severity: warning
 
       - name: ZFS
@@ -2278,7 +2271,7 @@ groups:
             doc_url: https://github.com/prometheus/node_exporter
             rules:
               - name: ZFS offline pool
-                description: 'A ZFS zpool is in a unexpected state: {{ $labels.state }}.'
+                description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}."
                 query: 'node_zfs_zpool_state{state!="online"} > 0'
                 severity: critical
                 for: 1m
@@ -2288,11 +2281,11 @@ groups:
             rules:
               - name: ZFS pool out of space
                 description: Disk is almost full (< 10% left)
-                query: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0'
+                query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0"
                 severity: warning
               - name: ZFS pool unhealthy
                 description: ZFS pool state is {{ $value }}. See comments for more information.
-                query: 'zfs_pool_health > 0'
+                query: "zfs_pool_health > 0"
                 severity: critical
                 comments: |
                   0: ONLINE
@@ -2304,7 +2297,7 @@ groups:
                   6: SUSPENDED
               - name: ZFS collector failed
                 description: ZFS collector for {{ $labels.instance }} has failed to collect information
-                query: 'zfs_scrape_collector_success != 1'
+                query: "zfs_scrape_collector_success != 1"
                 severity: warning
 
       - name: OpenEBS
@@ -2313,8 +2306,8 @@ groups:
             slug: embedded-exporter
             rules:
               - name: OpenEBS used pool capacity
-                description: 'OpenEBS Pool use more than 80% of his capacity'
-                query: 'openebs_used_pool_capacity_percent > 80'
+                description: "OpenEBS Pool uses more than 80% of its capacity"
+                query: "openebs_used_pool_capacity_percent > 80"
                 severity: warning
                 for: 2m
 
@@ -2324,15 +2317,15 @@ groups:
             slug: embedded-exporter
             rules:
               - name: Minio cluster disk offline
-                description: 'Minio cluster disk is offline'
-                query: 'minio_cluster_disk_offline_total > 0'
+                description: "Minio cluster disk is offline"
+                query: "minio_cluster_disk_offline_total > 0"
                 severity: critical
               - name: Minio node disk offline
-                description: 'Minio cluster node disk is offline'
-                query: 'minio_cluster_nodes_offline_total > 0'
+                description: "Minio cluster node disk is offline"
+                query: "minio_cluster_nodes_offline_total > 0"
                 severity: critical
               - name: Minio disk space usage
-                description: 'Minio available free space is low (< 10%)'
+                description: "Minio available free space is low (< 10%)"
                 query: disk_storage_available / disk_storage_total * 100 < 10
                 severity: warning
 
@@ -2355,7 +2348,7 @@ groups:
                 query: ssl_ocsp_response_status == 1
                 severity: critical
               - name: SSL certificate expiry (< 7 days)
-                description: '{{ $labels.instance }} Certificate is expiring in 7 days'
+                description: "{{ $labels.instance }} Certificate is expiring in 7 days"
                 query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
                 severity: warning
 
@@ -2371,12 +2364,12 @@ groups:
                 severity: critical
               - name: Juniper high Bandwidth Usage 1GiB
                 description: Interface is highly saturated. (> 0.90GiB/s)
-                query: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
+                query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90"
                 severity: critical
                 for: 1m
               - name: Juniper high Bandwidth Usage 1GiB
                 description: Interface is getting saturated. (> 0.80GiB/s)
-                query: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
+                query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80"
                 severity: warning
                 for: 1m
 
@@ -2387,7 +2380,7 @@ groups:
             rules:
               - name: CoreDNS Panic Count
                 description: Number of CoreDNS panics encountered
-                query: 'increase(coredns_panics_total[1m]) > 0'
+                query: "increase(coredns_panics_total[1m]) > 0"
                 severity: critical
 
       - name: Freeswitch
@@ -2398,16 +2391,16 @@ groups:
             rules:
               - name: Freeswitch down
                 description: Freeswitch is unresponsive
-                query: 'freeswitch_up == 0'
+                query: "freeswitch_up == 0"
                 severity: critical
               - name: Freeswitch Sessions Warning
                 description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80'
+                query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80"
                 severity: warning
                 for: 10m
               - name: Freeswitch Sessions Critical
                 description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90'
+                query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90"
                 severity: critical
                 for: 5m
 
@@ -2418,12 +2411,12 @@ groups:
             doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus
             rules:
               - name: Vault sealed
-                description: 'Vault instance is sealed on {{ $labels.instance }}'
-                query: 'vault_core_unsealed == 0'
+                description: "Vault instance is sealed on {{ $labels.instance }}"
+                query: "vault_core_unsealed == 0"
                 severity: critical
               - name: Vault too many pending tokens
                 description: 'Too many pending tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: 'avg(vault_token_create_count - vault_token_store_count) > 0'
+                query: "avg(vault_token_create_count - vault_token_store_count) > 0"
                 severity: warning
                 for: 5m
               - name: Vault too many infinity tokens
@@ -2433,10 +2426,9 @@ groups:
                 for: 5m
               - name: Vault cluster health
                 description: 'Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
-                query: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
+                query: "sum(vault_core_active) / count(vault_core_active) <= 0.5"
                 severity: critical
 
-
       - name: Cloudflare
         exporters:
           - name: lablabs/cloudflare-exporter
@@ -2444,15 +2436,14 @@ groups:
             doc_url: https://github.com/lablabs/cloudflare-exporter
             rules:
               - name: Cloudflare http 4xx error rate
-                description: 'Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})'
+                description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})"
                 query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5'
                 severity: warning
               - name: Cloudflare http 5xx error rate
-                description: 'Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})'
+                description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})"
                 query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
                 severity: critical
 
-
   - name: Other
     services:
       - name: Thanos
@@ -2460,252 +2451,252 @@ groups:
           - name: Thanos Compactor
             slug: thanos-compactor
             rules:
-            - name: Thanos Compactor Multiple Running
-              description: 'No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.'
-              query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
-              severity: warning
-              for: 5m
-            - name: Thanos Compactor Halted
-              description: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.'
-              query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
-              severity: warning
-              for: 5m
-            - name: Thanos Compactor High Compaction Failures
-              description: 'Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.'
-              query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 15m
-            - name: Thanos Compact Bucket High Operation Failures
-              description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
-              query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 15m
-            - name: Thanos Compact Has Not Run
-              description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.'
-              query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
-              severity: warning
-              for: 0m
+              - name: Thanos Compactor Multiple Running
+                description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running."
+                query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
+                severity: warning
+                for: 5m
+              - name: Thanos Compactor Halted
+                description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
+                query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
+                severity: warning
+                for: 5m
+              - name: Thanos Compactor High Compaction Failures
+                description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
+                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 15m
+              - name: Thanos Compact Bucket High Operation Failures
+                description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
+                query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 15m
+              - name: Thanos Compact Has Not Run
+                description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours."
+                query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
+                severity: warning
+                for: 0m
           - name: Thanos Query
             slug: thanos-query
             rules:
-            - name: Thanos Query Http Request Query Error Rate High
-              description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
-              query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
-              severity: critical
-              for: 5m
-            - name: Thanos Query Http Request Query Range Error Rate High
-              description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
-              query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
-              severity: critical
-              for: 5m
-            - name: Thanos Query Grpc Server Error Rate
-              description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
-              query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 5m
-            - name: Thanos Query Grpc Client Error Rate
-              description: 'Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.'
-              query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
-              severity: warning
-              for: 5m
-            - name: Thanos Query High D N S Failures
-              description: 'Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.'
-              query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
-              severity: warning
-              for: 15m
-            - name: Thanos Query Instant Latency High
-              description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
-              query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
-              severity: critical
-              for: 10m
-            - name: Thanos Query Range Latency High
-              description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
-              query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
-              severity: critical
-              for: 10m
-            - name: Thanos Query Overload
-              description: 'Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultanous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connnected Prometheus instances, look for potential senders of these requests and then contact support.'
-              query: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
-              severity: warning
-              for: 15m
+              - name: Thanos Query Http Request Query Error Rate High
+                description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
+                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
+                severity: critical
+                for: 5m
+              - name: Thanos Query Http Request Query Range Error Rate High
+                description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
+                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
+                severity: critical
+                for: 5m
+              - name: Thanos Query Grpc Server Error Rate
+                description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+                query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 5m
+              - name: Thanos Query Grpc Client Error Rate
+                description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests."
+                query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
+                severity: warning
+                for: 5m
+              - name: Thanos Query High D N S Failures
+                description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints."
+                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
+                severity: warning
+                for: 15m
+              - name: Thanos Query Instant Latency High
+                description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+                severity: critical
+                for: 10m
+              - name: Thanos Query Range Latency High
+                description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries."
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
+                severity: critical
+                for: 10m
+              - name: Thanos Query Overload
+                description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
+                query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)"
+                severity: warning
+                for: 15m
           - name: Thanos Receiver
             slug: thanos-receiver
             rules:
-            - name: Thanos Receive Http Request Error Rate High
-              description: 'Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
-              query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
-              severity: critical
-              for: 5m
-            - name: Thanos Receive Http Request Latency High
-              description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
-              query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
-              severity: critical
-              for: 10m
-            - name: Thanos Receive High Replication Failures
-              description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.'
-              query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
-              severity: warning
-              for: 5m
-            - name: Thanos Receive High Forward Request Failures
-              description: 'Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.'
-              query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
-              severity: info
-              for: 5m
-            - name: Thanos Receive High Hashring File Refresh Failures
-              description: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.'
-              query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
-              severity: warning
-              for: 15m
-            - name: Thanos Receive Config Reload Failure
-              description: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.'
-              query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
-              severity: warning
-              for: 5m
-            - name: Thanos Receive No Upload
-              description: 'Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.'
-              query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
-              severity: critical
-              for: 3h
+              - name: Thanos Receive Http Request Error Rate High
+                description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+                query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/  sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
+                severity: critical
+                for: 5m
+              - name: Thanos Receive Http Request Latency High
+                description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests."
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
+                severity: critical
+                for: 10m
+              - name: Thanos Receive High Replication Failures
+                description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
+                query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
+                severity: warning
+                for: 5m
+              - name: Thanos Receive High Forward Request Failures
+                description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
+                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
+                severity: info
+                for: 5m
+              - name: Thanos Receive High Hashring File Refresh Failures
+                description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
+                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
+                severity: warning
+                for: 15m
+              - name: Thanos Receive Config Reload Failure
+                description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
+                query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
+                severity: warning
+                for: 5m
+              - name: Thanos Receive No Upload
+                description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage."
+                query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
+                severity: critical
+                for: 3h
           - name: Thanos Sidecar
             slug: thanos-sidecar
             rules:
-            - name: Thanos Sidecar Bucket Operations Failed
-              description: 'Thanos Sidecar {{$labels.instance}} bucket operations are failing'
-              query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
-              severity: critical
-              for: 5m
-            - name: Thanos Sidecar No Connection To Started Prometheus
-              description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
-              query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
-              severity: critical
-              for: 5m
+              - name: Thanos Sidecar Bucket Operations Failed
+                description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing"
+                query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
+                severity: critical
+                for: 5m
+              - name: Thanos Sidecar No Connection To Started Prometheus
+                description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
+                query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
+                severity: critical
+                for: 5m
           - name: Thanos Store
             slug: thanos-store
             rules:
-            - name: Thanos Store Grpc Error Rate
-              description: 'Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
-              query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 5m
-            - name: Thanos Store Series Gate Latency High
-              description: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.'
-              query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
-              severity: warning
-              for: 10m
-            - name: Thanos Store Bucket High Operation Failures
-              description: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
-              query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 15m
-            - name: Thanos Store Objstore Operation Latency High
-              description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
-              query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and  sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
-              severity: warning
-              for: 10m
+              - name: Thanos Store Grpc Error Rate
+                description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+                query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/  sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 5m
+              - name: Thanos Store Series Gate Latency High
+                description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+                severity: warning
+                for: 10m
+              - name: Thanos Store Bucket High Operation Failures
+                description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
+                query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 15m
+              - name: Thanos Store Objstore Operation Latency High
+                description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations."
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and  sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+                severity: warning
+                for: 10m
           - name: Thanos Ruler
             slug: thanos-ruler
             rules:
-            - name: Thanos Rule Queue Is Dropping Alerts
-              description: 'Thanos Rule {{$labels.instance}} is failing to queue alerts.'
-              query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
-              severity: critical
-              for: 5m
-            - name: Thanos Rule Sender Is Failing Alerts
-              description: 'Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.'
-              query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
-              severity: critical
-              for: 5m
-            - name: Thanos Rule High Rule Evaluation Failures
-              description: 'Thanos Rule {{$labels.instance}} is failing to evaluate rules.'
-              query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
-              severity: critical
-              for: 5m
-            - name: Thanos Rule High Rule Evaluation Warnings
-              description: 'Thanos Rule {{$labels.instance}} has high number of evaluation warnings.'
-              query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
-              severity: info
-              for: 15m
-            - name: Thanos Rule Rule Evaluation Latency High
-              description: 'Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.'
-              query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
-              severity: warning
-              for: 5m
-            - name: Thanos Rule Grpc Error Rate
-              description: 'Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
-              query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/  sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
-              severity: warning
-              for: 5m
-            - name: Thanos Rule Config Reload Failure
-              description: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.'
-              query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
-              severity: info
-              for: 5m
-            - name: Thanos Rule Query High D N S Failures
-              description: 'Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.'
-              query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
-              severity: warning
-              for: 15m
-            - name: Thanos Rule Alertmanager High D N S Failures
-              description: 'Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.'
-              query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
-              severity: warning
-              for: 15m
-            - name: Thanos Rule No Evaluation For10 Intervals
-              description: 'Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.'
-              query: 'time() -  max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
-              severity: info
-              for: 5m
-            - name: Thanos No Rule Evaluations
-              description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
-              query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
-              severity: critical
-              for: 5m
+              - name: Thanos Rule Queue Is Dropping Alerts
+                description: "Thanos Rule {{$labels.instance}} is failing to queue alerts."
+                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                severity: critical
+                for: 5m
+              - name: Thanos Rule Sender Is Failing Alerts
+                description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager."
+                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                severity: critical
+                for: 5m
+              - name: Thanos Rule High Rule Evaluation Failures
+                description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules."
+                query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
+                severity: critical
+                for: 5m
+              - name: Thanos Rule High Rule Evaluation Warnings
+                description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings."
+                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                severity: info
+                for: 15m
+              - name: Thanos Rule Rule Evaluation Latency High
+                description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}."
+                query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
+                severity: warning
+                for: 5m
+              - name: Thanos Rule Grpc Error Rate
+                description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+                query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/  sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
+                severity: warning
+                for: 5m
+              - name: Thanos Rule Config Reload Failure
+                description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
+                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
+                severity: info
+                for: 5m
+              - name: Thanos Rule Query High D N S Failures
+                description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
+                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
+                severity: warning
+                for: 15m
+              - name: Thanos Rule Alertmanager High D N S Failures
+                description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
+                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
+                severity: warning
+                for: 15m
+              - name: Thanos Rule No Evaluation For10 Intervals
+                description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval."
+                query: 'time() -  max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
+                severity: info
+                for: 5m
+              - name: Thanos No Rule Evaluations
+                description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
+                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+                severity: critical
+                for: 5m
           - name: Thanos Bucket Replicate
             slug: thanos-bucket-replicate
             rules:
-            - name: Thanos Bucket Replicate Error Rate
-              description: 'Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.'
-              query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left  sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
-              severity: critical
-              for: 5m
-            - name: Thanos Bucket Replicate Run Latency
-              description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
-              query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
-              severity: critical
-              for: 5m
+              - name: Thanos Bucket Replicate Error Rate
+                description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
+                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left  sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
+                severity: critical
+                for: 5m
+              - name: Thanos Bucket Replicate Run Latency
+                description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
+                query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
+                severity: critical
+                for: 5m
           - name: Thanos Component Absent
             slug: thanos-component-absent
             rules:
-            - name: Thanos Compact Is Down
-              description: 'ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.'
-              query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
-              severity: critical
-              for: 5m
-            - name: Thanos Query Is Down
-              description: 'ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.'
-              query: 'absent(up{job=~".*thanos-query.*"} == 1)'
-              severity: critical
-              for: 5m
-            - name: Thanos Receive Is Down
-              description: 'ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.'
-              query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
-              severity: critical
-              for: 5m
-            - name: Thanos Rule Is Down
-              description: 'ThanosRule has disappeared. Prometheus target for the component cannot be discovered.'
-              query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
-              severity: critical
-              for: 5m
-            - name: Thanos Sidecar Is Down
-              description: 'ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.'
-              query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
-              severity: critical
-              for: 5m
-            - name: Thanos Store Is Down
-              description: 'ThanosStore has disappeared. Prometheus target for the component cannot be discovered.'
-              query: absent(up{job=~".*thanos-store.*"} == 1)
-              severity: critical
-              for: 5m
+              - name: Thanos Compact Is Down
+                description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered."
+                query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
+                severity: critical
+                for: 5m
+              - name: Thanos Query Is Down
+                description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered."
+                query: 'absent(up{job=~".*thanos-query.*"} == 1)'
+                severity: critical
+                for: 5m
+              - name: Thanos Receive Is Down
+                description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered."
+                query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
+                severity: critical
+                for: 5m
+              - name: Thanos Rule Is Down
+                description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered."
+                query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
+                severity: critical
+                for: 5m
+              - name: Thanos Sidecar Is Down
+                description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered."
+                query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
+                severity: critical
+                for: 5m
+              - name: Thanos Store Is Down
+                description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered."
+                query: absent(up{job=~".*thanos-store.*"} == 1)
+                severity: critical
+                for: 5m
 
       - name: Loki
         exporters:
@@ -2785,32 +2776,32 @@ groups:
             rules:
               - name: Jenkins offline
                 description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'jenkins_node_offline_value > 1'
+                query: "jenkins_node_offline_value > 1"
                 severity: critical
               - name: Jenkins healthcheck
                 description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'jenkins_health_check_score < 1'
+                query: "jenkins_health_check_score < 1"
                 severity: critical
               - name: Jenkins outdated plugins
                 description: "{{ $value }} plugins need update"
-                query: 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
+                query: "sum(jenkins_plugins_withUpdate) by (instance) > 3"
                 severity: warning
                 for: 1d
               - name: Jenkins builds health score
                 description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'default_jenkins_builds_health_score < 1'
+                query: "default_jenkins_builds_health_score < 1"
                 severity: critical
               - name: Jenkins run failure total
                 description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'delta(jenkins_runs_failure_total[1h]) > 100'
+                query: "delta(jenkins_runs_failure_total[1h]) > 100"
                 severity: warning
               - name: Jenkins build tests failing
                 description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'default_jenkins_builds_last_build_tests_failing > 0'
+                query: "default_jenkins_builds_last_build_tests_failing > 0"
                 severity: warning
               - name: Jenkins last build failed
                 description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
-                query: 'default_jenkins_builds_last_build_result_ordinal == 2'
+                query: "default_jenkins_builds_last_build_result_ordinal == 2"
                 severity: warning
                 comments: |
                   * RUNNING  -1 true  - The build had no errors.
@@ -2828,28 +2819,28 @@ groups:
             rules:
               - name: APC UPS Battery nearly empty
                 description: Battery is almost empty (< 10% left)
-                query: 'apcupsd_battery_charge_percent < 10'
+                query: "apcupsd_battery_charge_percent < 10"
                 severity: critical
               - name: APC UPS Less than 15 Minutes of battery time remaining
                 description: Battery is almost empty (< 15 Minutes remaining)
-                query: 'apcupsd_battery_time_left_seconds < 900'
+                query: "apcupsd_battery_time_left_seconds < 900"
                 severity: critical
               - name: APC UPS AC input outage
                 description: UPS now running on battery (since {{$value | humanizeDuration}})
-                query: 'apcupsd_battery_time_on_seconds > 0'
+                query: "apcupsd_battery_time_on_seconds > 0"
                 severity: warning
               - name: APC UPS low battery voltage
                 description: Battery voltage is lower than nominal (< 95%)
-                query: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
+                query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95"
                 severity: warning
               - name: APC UPS high temperature
                 description: Internal temperature is high ({{$value}}°C)
-                query: 'apcupsd_internal_temperature_celsius >= 40'
+                query: "apcupsd_internal_temperature_celsius >= 40"
                 severity: warning
                 for: 2m
               - name: APC UPS high load
                 description: UPS load is > 80%
-                query: 'apcupsd_ups_load_percent > 80'
+                query: "apcupsd_ups_load_percent > 80"
                 severity: warning
 
       - name: Graph Node
@@ -2859,25 +2850,25 @@ groups:
             rules:
               - name: Provider failed because net_version failed
                 description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
-                query: 'eth_rpc_status == 1'
+                query: "eth_rpc_status == 1"
                 severity: critical
               - name: Provider failed because get genesis failed
                 description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
-                query: 'eth_rpc_status == 2'
+                query: "eth_rpc_status == 2"
                 severity: critical
               - name: Provider failed because net_version timeout
                 description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
-                query: 'eth_rpc_status == 3'
+                query: "eth_rpc_status == 3"
                 severity: critical
               - name: Provider failed because get genesis timeout
                 description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
-                query: 'eth_rpc_status == 4'
+                query: "eth_rpc_status == 4"
                 severity: critical
               - name: Store connection is too slow
                 description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
-                query: 'store_connection_wait_time_ms > 10'
+                query: "store_connection_wait_time_ms > 10"
                 severity: warning
               - name: Store connection is too slow
                 description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
-                query: 'store_connection_wait_time_ms > 20'
+                query: "store_connection_wait_time_ms > 20"
                 severity: critical

From f5f6b338a3d5b1a9157e5c3c74dadfa818ad1df0 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 10 Feb 2024 23:24:10 +0100
Subject: [PATCH 033/123] fix: high/low cpu alert

---
 _data/rules.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 10833f4..678dbba 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -385,7 +385,7 @@ groups:
                   This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
               - name: Container High CPU utilization
                 description: Container CPU utilization is above 80%
-                query: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
+                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
                 severity: warning
                 for: 2m
               - name: Container High Memory usage
@@ -406,7 +406,7 @@ groups:
                 for: 2m
               - name: Container Low CPU utilization
                 description: Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
-                query: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
+                query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
                 severity: info
                 for: 7d
               - name: Container Low Memory usage

From c3258de6c7f050d390706983236ee9774caa3901 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sat, 10 Feb 2024 22:25:26 +0000
Subject: [PATCH 034/123] Publish

---
 dist/rules/docker-containers/google-cadvisor.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml
index 4215f15..7c7bcb7 100644
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@@ -23,7 +23,7 @@ groups:
         description: "A container is absent for 5 min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ContainerHighCpuUtilization
-      expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
       for: 2m
       labels:
         severity: warning
@@ -59,7 +59,7 @@ groups:
         description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ContainerLowCpuUtilization
-      expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
       for: 7d
       labels:
         severity: info

From 05c4716c2b4da6b8f94205d99174259d0a98cac1 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 12 Feb 2024 09:41:03 +0100
Subject: [PATCH 035/123] Fix KubernetesAPIserverlatency

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 678dbba..42ff8f0 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1981,7 +1981,7 @@ groups:
                 severity: critical
               - name: Kubernetes API server latency
                 description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
-                query: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+                query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
                 severity: warning
                 for: 2m
 

From e2d3dadbc547f570858639fcb4343e3942c0b866 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 12 Feb 2024 08:42:15 +0000
Subject: [PATCH 036/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index ba01753..e43a1fb 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -302,7 +302,7 @@ groups:
         description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerLatency
-      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
       for: 2m
       labels:
         severity: warning

From 90706282ad429e22996edcd037048b0c385593c2 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 11 Mar 2024 22:55:05 +0100
Subject: [PATCH 037/123] Update rules.yml: alert on ratio of throttled CFS periods

---
 _data/rules.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 42ff8f0..4a879d5 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -401,9 +401,9 @@ groups:
                 for: 2m
               - name: Container high throttle rate
                 description: Container is being throttled
-                query: "rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1"
+                query: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 75 / 100 )'
                 severity: warning
-                for: 2m
+                for: 5m
               - name: Container Low CPU utilization
                 description: Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
                 query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'

From 7b3cef8bf9c4f5902d02e29ba226d3b6436d1dcb Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 11 Mar 2024 21:56:16 +0000
Subject: [PATCH 038/123] Publish

---
 dist/rules/docker-containers/google-cadvisor.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml
index 7c7bcb7..b495569 100644
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@@ -50,8 +50,8 @@ groups:
         description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ContainerHighThrottleRate
-      expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
-      for: 2m
+      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 75 / 100 )'
+      for: 5m
       labels:
         severity: warning
       annotations:

From 1eb5c5834fcb92116da32b460a1a7bbf15f4b77e Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 11 Mar 2024 23:28:06 +0100
Subject: [PATCH 039/123] Update rules.yml: lower container throttle threshold to 25%

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 4a879d5..9f430eb 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -401,7 +401,7 @@ groups:
                 for: 2m
               - name: Container high throttle rate
                 description: Container is being throttled
-                query: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 75 / 100 )'
+                query: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
                 severity: warning
                 for: 5m
               - name: Container Low CPU utilization

From 693c9e51b2260fd1f11622430957b4da285fa0ae Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 11 Mar 2024 22:29:17 +0000
Subject: [PATCH 040/123] Publish

---
 dist/rules/docker-containers/google-cadvisor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml
index b495569..cfbc333 100644
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@@ -50,7 +50,7 @@ groups:
         description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ContainerHighThrottleRate
-      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 75 / 100 )'
+      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
       for: 5m
       labels:
         severity: warning

From 46781af565eba9c5c3f5d48cfd7e8d4dbfd0c6cf Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 21 Mar 2024 16:15:51 +0100
Subject: [PATCH 041/123] =?UTF-8?q?welcome=20@betterstack-community=20?=
 =?UTF-8?q?=E2=9C=8C=EF=B8=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 _layouts/default.html          |  10 ++++++++++
 assets/css/app.css             |  22 ++++++++++++++++++++++
 assets/sponsor-betterstack.png | Bin 0 -> 16118 bytes
 3 files changed, 32 insertions(+)
 create mode 100644 assets/sponsor-betterstack.png

diff --git a/_layouts/default.html b/_layouts/default.html
index 1a41b93..aa0ae44 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -125,6 +125,16 @@
           class="fa fa-linkedin" target="_blank"></a>
       </li>
     </ul>
+
+
+    <ul id="sponsoring">
+      <li>
+        Kindly supported by&nbsp; 👉
+      </li>
+      <li>
+        <img width="" src="assets/sponsor-betterstack.png" />
+      </li>
+    </ul>
   </header>
 
   <main id="content" class="main-content" role="main">
diff --git a/assets/css/app.css b/assets/css/app.css
index 81f18ee..7f6c276 100644
--- a/assets/css/app.css
+++ b/assets/css/app.css
@@ -115,3 +115,25 @@ h2 {
         max-width: 85rem;
     }
 }
+
+ul#sponsoring {
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    margin-top: 50px;
+}
+
+ul#sponsoring li {
+    display: flex;
+    padding: 0px 15px;
+    font-size: 16px;
+}
+
+ul#sponsoring li img {
+    max-width: 180px;
+    max-height: 80px;
+}
+
+.page-header {
+    padding-bottom: 30px;
+}
\ No newline at end of file
diff --git a/assets/sponsor-betterstack.png b/assets/sponsor-betterstack.png
new file mode 100644
index 0000000000000000000000000000000000000000..b0c12b050434998779024ee12aa6f843c450c241
GIT binary patch
literal 16118
zcmbt*c|4R|{Px%~m9?la)|4gLLdd?Br7VN&#8}GCkX;DLmVMvJuI&4ghhfBIPfRM=
zw=$R*llOR@`u*O2-_JXr=kqk?zVCCN``qVzzt{J=uJc0YzB=_e=5r7Tg!<lH6+H-q
z91gw@LeGHz4_s>W2mg?JD&2!a!6yJ}8wLKh^n*Y2(zEvAaPx4rb96#*c=@^^I23hs
zL?94o#-hf95zU)uEkj;j^n>B|_m=b^WHuKCFKS-YEK(sOBRfw96&4c~mLN}5Q)rTc
z^Nv9%(wWnZGnulOGRZ`uFcgrxO$<ZQlggqH2nXbziX!}J_7?U8nKk%{@`t(<{v@XP
zS-u!u1ND_wl3~-HbFV{gc%6?*U`c48P~oMh`hJt+9d(r(*)y>>T+|&M!IDoIQS#wW
zmoFa$jegy1Z9Pi`jrf{jZ)fsPKJPh26<IDnbrl-=Ru%3pSMX^6e%fpon!}@9Uhuj!
zniykqnrMiT3t_xuT)Y2!ip1S02Q~4~8!2S`cb+AOBs?SibFKHwC<im~H+p>edJO$h
zbazSq`#og&tZDW1(}nI1BZU&)bCFT0Ubk<b4*Kaw;UYq163b({G^H6_z#A<`?+oo_
z`0o(JK_W6rLRELq;^-yj47`i>=L@KEz@KAZB%xd;LGdqS)`mFc7f!4P&3OMDftboj
z)uM|?Qg+I3YL=`1?VNDl!sEtEv?o7ahoJFi9d3tP%l*ATiXO>!A5M6vaIK{M-owAY
zhVVKzn&e>Re_88&+QF4d@@ta+Ib<c-%p2{7r`^5<I<<d~@iO$!fu1u`^o^#~hWGbu
zBGC22B!30~0uxbGQ9P74Q5n4d_ot72fvFl91>YP6sQz3WLcLuNM~9#lERFwmp~_uD
zW%8XH``<G(?px4D5dI#8kNK;r)NF%k``_o`KY^+$J6>sjpSXu>^ahv5|NZxliK;F`
zo95p>SiORgF#fqXB-&z4@NY{Y&U89|Pb!1{U+4V){QT4qh7yC(tj$;kmskcJKDX%f
zQ?0e=ENpgN7@wVG8nL^DuG8`n`E%do$Be4FtAga|JkeZjZEc_KJp4wG3;gxt84UHx
z<ZnkWa^#cB57xze?hbz74rAZjQ7@v8RlE9G=}#k9&X6#+q#>Aca&o5F-rc%F<sh2z
zgW=EA>rqHorih9XN>@@5%NlWzgiQ<mnRp=u2?eAoDqh~+-amx%ULPN><V|kuKg6H?
zZ+xss$>AY|yoLCkh58*MsG?<804FCoq3h30z1}6T=T}zV;3JISvhC_Kk^{;9yKXg{
zD&^{BlnCg}y*c^1(@+CG`7aU*JicFg`FxUiRg>bs-!h?0{Oa3jmO%wFm=ZQ+sABL5
z@wQal)qTnTjzcOql$WAZb-88JmAukQxSr%*`u9MHO)~e3Z7!HBIXv#;>j!zstT&>$
z4iFjt^(^k*zb~L1>!Sa9qQIrOvi+_)wQ^7#JQcBbuw`0;+LMWw3;tcTmwwuq=y%`y
z#;LlA+<W^QM(DbjsMX)y@$Rt5a60bre4m}quNEc0x7B2ll7%#WlPulX*icu$A{nRb
znE9BW=~&%|L?M1-<H-^#W-<Mn<aG=#nVsc3Xa$T%BBw1fIrigt)@|4AllMsTDmC!b
z4>S_PU0hsv<7_k%zt8sEGpN}}+-Upm%a)Uy`{=V2<f%jy>9Rru`jsEy;N}x$E6YJW
z$I_oqwXCnK3#vIItV-rTlu<w=UE1Uv&|fsV7u3(3@1d6}9#xEpwt$ww3MAv!HaE5A
ze#!^`juA+SaMVy(5^<O$g}~ATh8^BM{NN=mUTE~u9d@oSVHAUTb?@v4Vc%T+MD9k?
zW@y>!^Lvi*A0EB^;1^K!i%jBPVtDBUA5xn2ar0cmLxC6vEAWDACF9EHw)EPurNUIP
z<s0<^R~PW*6Kfxy%1kF(Q$dag7DXE8oY%eE2guzDq`~|7FftN5y@+zJk}V^1gNi*S
zfn>g>D)wqOFEZkOUl;RExY(sxbMfaxA9qN8#ICOzU4IaGkPP1Fm|sv>JgQ8g$D}>h
zgq)HDvYEUhMIEWUNXAE-vXd|R<N^e;r>LO%%)`lyK^1QbLz$4C%aaS#3wn=Au6zgi
zSoQwlns>VzIX?-+O5PoIIvLiyD%yD|Z!9RLrKPPFlR`d_lQ6zO4_PjB1!{KxKDyBv
zeDXW!&Ye3&rwey<bToNi(AL-vP8&l#yh*xQq;*;Sfi<q*eLm`ovHmF87rMotExONs
z@J5$StUW}^m4!j_`RH_BaSV$}gEs2x>o4EZ1(RlYSo7NOl`IA=&bHxs#nf|Rs<Zl0
z?pq@lQ6k`Zq@|?5B-PBhk3w7huvsB-a}pF1Vrk_!Pen|wc{wzitO=rvc4gFA$sw`f
zh@^~^-el#ltGL7LBchyZL6<&#L%GJki?dfWB=yN|-B2X7FF~#7)%GI=YIm4?d$@IG
z>37Mvqfmo47`+$=(jJl4j*ep9Xs^r4ZEIc{VZ91IBHJ7J{f`tYr_Leidg26z%djr>
zo9BnGZ|t*UDI5~pdKn7a(}-Xq58`B+Jm|4EiH}W$pRX<mFa`uDB_;P>o1Z!O`1-4B
zSZOvTq}k?dDIGn9qJm2CwX#rYC?t-Wl>anzVJ<ZM+G<|-STCW)W^yZQq-<e_Wnu2T
z4n-*c@)M1|ZoU-~j^U>=a+y{dd{EL0N_U6-KBufmRU6(g5xA7heRiD#BFPUe6XrA8
z{+h5N6^pXT)VeM}xXRVm+G;&_xmgCYw!R)M7j#_K&I_pm6Ki5Z{~+d;q}l9`?$*|~
z(!WAzuY?fa+tnAbCNh(gbo%4t%w`Y^w?VrhchZwnhuDLUd)vDEAN3x1L>CSYewAyN
zB-4Xj@AXJsz7mg`bZOqvKXE3Sdt(+$USLS#j8hTnEa07<_cJpydox4d_mvJw2kG$v
zT?DZ@k=q(;pS$9u`R4cjjHLkK)z<pDh7UdDBf>Z$C78<ElMZ2YB_dXm7BU;r_jOGE
zhH>gxe?P^B-0O+8F-s#z$OTe<DqDo1nWg2-XZ2LXfM5Y$2ZkEdd#09o|8+|<34{T3
zGV>tb$sNWbo5ltS84->=7xo}e?%u3xhMKXxhsm7Wx0;$7;RSL!`g>W$k{HjMvQHL&
zUcpO1Lb#-Fkv*r2Z^C-fbkp=DfT^?*N(q64gq%CS4W4^lkng>0^Ghd0>Nz=?_^cg*
zSRSTitJ{q6+OQ}XZ4hQkZhJ|Q7DFIEL{mIbXF1DeHpbT0_*!c;;}&I1F=-cDEG#WS
zTtQ2B?aI_*OUWT2dDepuwCi{lE9#{mz)*T&vUOI1j0v6LO?06$n0DIvVn>aQ{Y<~k
z+{s0*C1!|%93!%NhtObA>)4JR50K+ozz-p@m&2Ap#9Z+e%g}P@w9wBiJ?&UGK1vuQ
ztq>|`+2lVWm?6*gy=+hI$CV~9)O0HA=8`Yl%{0wVP7>qHO}Y&CW`!V-`Fn{fkL^J_
zctyH6Cli$WAU^QgvlP+O#Po|JUr#@WDlw3i*{r3r6CwCQ8Vp(|Z~pU*wuV_JFl*~~
z?hK1ArU_F*-egqKFmaNPtazH#&Wwk$A*NsM5cCcngbM(?;R3qDVbl5@1^XoV#=YeM
z->~RjVTgnhG`jGvhK{PYAq%8OBuRwl1)Y*#e$jVH8qkcRXU=!uH~}jNL=9Y>|CsBo
zz%Y$;>Q<8FsNF4awc*5CawpIDvH}kzWXKljrG5RPhk06HH|A`gj)@M%@I5lf4@HHN
z2oTIPXW!B=nG_!_VW@2a6`Mu-vo4bILm(c5SJpQPU5W3LQQ_8i73tpW7#)NHsDL6T
zNnwT~%IH`nY>23M_PkAVeBeSf)g{~V5SQGQw}OkYQXrLoPoRG@-TD59gIJ=d{rQMI
zQpgVlg(ZcGB7$$O5zQBn8S~_WOM|+DyOZ-)eD@A<_D8>d79}5Yws2}vX!W#ai;{+e
zPPkFR!JAH8ipjfrRX_2r^J@->LK0`b2M1#%LB{K@tCCdf>_YH~_Q^4hHt_fHVOMjY
z@D<@Sow!-22#dfE>wA02$l#Vd638!i$|?*7V~qFV-PXFtbz0c8aDaALNf@_{XIW9!
z!HUBmSeToeuL+)kiOj8BC51?ynI~aPS`XSTP&ZEHw7CEYAtB|bl(hX7TaF_B_)50%
z+xA5vw39DCWxD|s3{Tm7LVAi@mt`<E!L*#F7a;jvC#!-Xv0k>XlWS^gjq5dJFjr@g
z)en8Yudkz{W0*rq2kSP2u5uk6?QawX?~G*K##0kLx~{)b&RvNmb{@0_Kd0L}+JEA4
ziZbvg6da;L3EKp@=hlNw-E(uMMRjhO0vJy;E<2GM0Hd|JX4m*LkcAFkWu+*>(O`qW
z&THj$ula?1Qqb#%)rOzqIv)x+h%vvF_;p0=VkAkCjN^;5AvfspyDpJXCr|`1!SZrM
zB6oQTjfKP`vzZbQ7r~-*XD?s;nI8Dal@&SnRK_lkM+xQ2ea|C!XTaU8a;38)SQgXX
z9jvVzlrYu*v%qkgo-ln;A_!&&@?l2n3Qp8|5Cblg7s6tk4zoXh{;UT>;ddsLQ+6im
z?u)MFCO@yg?_-V+$_xV_%CTEMGkhawF9Espl9c~R^PFns)Ob_%%#&%{tV?tM(9nx%
z&rd!Z-tAQ@G&1jm1)`#y&-dxz`G_jeUJn-~DE767Yp2C!MLLGK#W%l$;+=C>7<CyK
z0SLQe$+%){zp?M<pA)(5(bYhq$Z?p~q5rwt3{$#>ch&uY?TcH4OK*F(-(1G|(vvHG
zZQE9y=~x$`?Ip&zO0cXUY96&?>v9(gc1*Cym?~UFQGl8ZCdq<FDo&qV(Q2M(oK_)i
z4%OZB>*R_q1m`}!vmmBzymNL+A}hARJB3bZ!&RSA6wY74f>{26Vh?_!X7rp+3}A2>
zw)!po)8#{B;Ad@Fc!wy4@dTNy!K6BQR1+@OuHw@f8P*$ulv=+oT+o~z_8vEQ3Z$8|
zU+b65Ji-6o56lDaGZjx~M6g*P{^PDyKO6H+vL>+8pdkl_vX&hM$m&4a->5ZQ6!pQA
zliWWwhY#K~v3QD7pMV%+oRf?=Z49If5L}*HJ8Zd@7v0C;vJESzh*4xY$hK-Q!Cabe
zY~8*Q*=eloqk@u{ScChg4h#*Q^`8{f*56urF2Lw7E~|{_w5ZR?s`yNZY?^bXk0?m|
zd>nWi<sQu&oor4vaSJYkncI-@O2d_m-~$g9M*ktq;A9SB#?stUPox|(FLheP6ElYf
z%$Ec6hHQ!mv;K}rx|FgYw4cY|8uFgBmIvicr<6K=C3GEZl;W5_Wfs)UIu&=lWJQhT
zQkyUu9+s~j9hvyh^I|*?JU-Tii;kJQLhu*HA(*cn3oWSVovU#+>t50f5e2%&bjGQ#
zb_gI@H`@Px)(VCbz|qC+g@0k|_|g)hXnUcq>w}c#d_jx9*v2RpGLnQ~GPkhMYn~H-
zyg!7+mk&9Am%%tn!cIZ%DNI_SY@hisaP;7L|07QETm9}Z5bM21@{}G*v|cbDz9z4x
zhHxCdfL8HIO3#>l6JfU)ux&u&IfTm=6cj9*+hQi_CLm6G!AukRwC1Y?fOT8m0N@nu
zyad1z>tZV6ba1VXWw>tWO71Nf%`E<`cjky4fPOLO$v64KNUxvq8*cNE&RG{k*Mrsk
z^5yxfwFUQI_J;;6q#XpgnWuqyi0owmp^sn0@fs}d=X+Y;Tzl<QjZ_4v_q;BX#VIhT
zItjrS@Wu!Av@HG^um1)0Vpd;%_w^^=j|mL7cGxGHN~AhpFXHSnsb;1Z2<w}@Fpp(a
zrAM}#U;PB}&0Vm^&3X`{es@M(w?8i9A`ri{34+zzzN|g_EJTB%R8~h6-PajV3a04$
zifE=yFr-hEO{R+77k_^IRh9fYd6plV&h}cAb-t7~&3s8EP=X>fY8&nBFZ}%O=LEDc
z6+K)^uSsq~<;2cpwZmt4l$E8HeXN^F9+h1#!4^-i<*Qgk%5R){j~MbL-5@6^=Fki*
z4RL@$Zb^nvA>tW~atis$Hg|Uu-w7uN!Dd4_dMMw&;>(konCP|C%Z<E=cRd`NB3FVs
zu6?t<!nUtIC1O%-IPHCZbDWVbd&DlbB}fI8rFe%ezNCpy7E><=ie+0OzdV?`Am;G!
z@W@w%diwefNh`l8ulzFju;>nz^43a;XaCi*8pq1xVn=irSIp*=LepHe?hP6@=j=*x
zs}y$6;9E?X?6WLB<^#q}iRcswI5bpYUFu^CDaCCpW=fL<<$pDrYip&qv)l}k&~EO4
z9ymPC>O#kFF1!IKBhC17B^kdi1Cj2ouhZh=_wQFcw|c`+O>16gkDzvNs{>p`-P-S8
z5zFLo+nXa5kIt&kARBLs**creVD@MaY^ck`#MAX$viF(nThUKycTLKATHg~j@rQ5E
z7<$R;6OJX2gBF(zS4s;5rzWj5THPrOv}npW()D5+Nm7<PTJe;4kJhTd4lw1mo`D;?
zG!MIp2TyN#H|EWC%?%9=jY&M#vi4U1(E)_!yN(Lwt0VB)zN#mStH!s8MZ_y){Bv8=
zzZRa`EUq7a5zyCVK-YhQ>mG{CZ|t9jj-0HsUh)rwoROsb+O$6|yqY95uJtB+EIQ0(
zFzYA^m$aQEl>EkbpKK9*7P^Bi{mb?AdGw<f)9VCm*l!(w?W4589G)h~|3Upd+|gW)
zN$2g1)j&a4Brwc-?}A36%KG;-+@aa}1vfAmSFgcZwO;Fk%zGy6*>)kM$i_HT6iVuc
z-_Fs!u{J9`wk#LQp(Bwt`M9xg>anUKUBZel;6zsU_9oFK^3PVkeJlKl)T1NU+t!`M
zIj0wA`&ZE8f;U?_I+8z?!eZ4%I@G<?y=dNA)ZVZcnat?^lCW4lZ@a%dmt3NUza8Nh
zSE63HF2NjIY=J)vsM$1wq4>;ZZpvT?*7~xOWuu-hezl8l5oKjnw_&Kmwa@!957uNk
z!^^Nm<%H(y_VPNcOC#;9n|U)dH2-1kOl;PMHa>7~+}j}{x<Yy={K#PeKhgznQrG&l
zZi~rGE#SH{XY8Hzkmm*&KZxv(A2^|;TfLzx62^T&G_9lB9W*~g$>EdTlD4m}`R5vG
zT~B&cKfAdv@@Cd4`Y=E+5c+IUk}^1(BfrpYZpB7zd7f1%ihX+_uxe)Gk!Z%<?%<7}
ze^!4xZ@BPh99Zy^2@EfdA6cW@YL_(U3NG3V^T)6F_B)7uy3<lxA@_~>D5<pk08%=-
z^?@^|lpNkPcQcev+?Yv6io3_%*Vp&QPwAYv1FK%mteDN1EpNiN;_g!CLA|lQhKR~-
z?f#d_1H%uXx-n`+pE}FIx&(Er31smD8z*=GCqlsc!CyBecPukvhoy1lfBGaRAHOL1
ze21XQpgK+Q(95zttx1`bUo_VjgL~YpiI`SLO7nWJ1&o5Bww}<9P_C!dhL$-so33RH
zXw#N5sOmOE7xE5@b`uX+stqeOP6pgzN4ra!OEw5&HP?l%>f;?aORSn0bL>Yvh#ZYo
z=E1xe+$wGb)ArKcE6#C#QxhIT8(SC!>Um?l)D<dTD&^_7Js^r@ErKOulp9|N>g?E9
zA6R6dTuiT(Ld{0kFT6?hr8yB~c4W-aBQa)OI&#DLZY3xvUMNQ_dk*K=m7t1oi<`(i
zxxH=dopYfIO69q!+-E_YF2~uryPKy0c!$Gz?#f_ReVHV8`gS{{V%3T?Fh(#G$U3ue
zJO5T0({0E`lQB0*+q?yYbk1~UmKK75!O2L=%64|!=N`o-aTf>-gOqy|RCA9JVOl$r
z@>?6EEth8F(*{Z$P1Y6wYld}614S-hpAGXJd$e-fRq04cFPC`b^ft5IzG|$?Ez6}l
zO*ac-1Q=xj^Eaa`l7~q#a~-v~!bv`?;eg4&tZc6JL{jB^>2M*?TzmBlCtb14v~s`F
z2c~}HDgp*|<E8S-(=URkF3cxe(YJyx3vD;%X#nt>SeqQ&81>D)p3QE5=5F^wnuL8^
zOhH%8#x#HY=1Ry_oKC-+ZQ;6S$J%=x-M5)jBDST;Ra9j#X{x0JjX8Ya76~c#8Ts7x
z*ZpSv>#Y6?s{P}G*GA*X7}<-iF$wKdAlX)aB7?}?ER?4t<;~!7KA<d><$XGMCkyvW
z$QjnhsKt4ex76dtiscy7AhzGvrOiEPvf?3);ebw)a-Qcb&V}j>+4Pr5kJe;4ycuH~
zDL=II$Ao1P>z5C-Tgulh0iE`VjGW+tw95>NTdRb*6Q|u;Y-r11pXLIfHEH>DO1_KP
z{@>avVlR`!gFeH*xyV%frk>ttJn##{{lYkFJ5kNC!Dl=1VH^sq|JX|B)DZ<M+`2Vs
z4}-qc(6U=-X7rk6AQe+QHlSr?(9$yWw8V?wnN@{i+O)q1Yud&!cJ*UySIX*gva|2D
ztsfxD8|JnKZt6SKfj&SvlgWQd*6UwSda<`F7tu~{Sn@M81dYkk$nMN&^IPXhI=iu7
zdzAZ4=0M*xiKS2bY(;X`im!=uf+R-qcHCJhsZ^EP=z75mN^rTZDo(MA(52xeJv3Uy
zK-v#Z38&L~r5>zpsGH;TqT&YeW0R9x<u%uj@i#q(HeHaZ=!nO0viWrcrKH)GB|rL#
z_hr6XbmUGm9U6-iBq@2f4J7^HNc#tFL3iPS4~EK`y>FX8Gfc_61m|q?$A?>&Ratx!
zXuog9-q5zrJ=#{Lk>FQ9d$&E_pv&}jR%~)Uy6c%o3JD{*{`We;e7%*vE)Wu>r~CV#
zALJ`cX-~glXGznMlaf+Qjpl-u9k=ZJ$sVYY*xN;J--EQGLUst*SMzsqrRR*iQ_u}Z
zuief>Rvb_bm(`-3g|FOgd$gRPbzM)c*vmC;andB!-%?^)G2Xb^Pyq9$22V2McCGO7
zaO-gT9aWU@Xko%FFUN%^<^1_vH{2QsR4`BB65i-<jK<3wGqsc_a}~pO<qz#%&OLgY
zP-n@hQjM7M`nNL3*nZSQ4{!^>h7Wh-YwYLn#IV#SOWfgB=F8@Wos<DBxfG*>WgSo>
z-8{)!k?MDUUso#)LrKMz-0VId`sGVWC0NEx6pB4<DgjCEGX9+w59R_Vckr_ZjXT6d
z1QS7TS7j=#ZTXuo8z2^@18WR#OcSR#{nwsyk^{K68)%9C?QCtXP%=vWYQ?wYa!jny
z)r{C%bENz#KCw@YQ=mN{^1SAN0+FF^^JOK8%{xlIF>@<*Xm6cR60^{cdhUC?_Yyx5
z++nD6HI@~^kE5eZXUj<DST!e1(M=etKunv(cN?(r?O3(f<L4#UPfOK^M3@@1_fq&r
ze3tsa?T8(M$;JT(^puGtUc2|`pJsI!N+Xo7W<z7PTf6Ff+5G3v(?@s%{s$M?B9sG;
z_U@sBp3{L8Uc71gFO>lZ6zQD%)koh!O<W?7yTUYeb$Du+9rd<xe_!OwxhrlGPd6?o
zy-875b9H6BPEp|n*i?xu?W?tlbmU@e@yN@44`DmhSoS2doxQ;Bcw5`>%nxII#t||Y
z<4`10;L>?=eTLKdg&!yrwH+*`jpsM4OlhE8?LHyaCKJ&Bl2^T#2wlFLSZt{C{03Rr
zE!lK3dra=x&f^~~rJ!tGD$kT19tn?U!h-QQ47l%@$t)s}sDCBG%h$KO{fb%z3j;s}
z1j=hOoox>Y+{2;=RE+w$w=uYfVF$>IkoAWGm-NW~iCg4I-wi>Mq_oAx@!xK7jBO5F
z@)g`#3~hg>%Zc#>O4GPGm86GpD(k#y&s2!W*6hAa{LLu~A6Gh=xXRa~?y$ve%h4F^
z91_Ken#%j_&r%KpT+9u(^$U5Gbe8CohQg!UDWqvWfvB{tZ*_iPdqU!s$ULezL=gXm
zJq460fb(XKM?p&+tPB*cVQ`-^E*rS~7;~^)>fluI;Bh=A;ulJ`Oj7~PpZ<$B&NPub
zWd#ldhmu|(w$OJf(7KCT{Rqf<@uVg5MeOa-!}DGg<jDJFj?xQw$5yk?ltxapGMr`O
zTKpR{+dZ{>vwS7JRz1^0xaAR+R3e;S_0{B1|FBi`z3vRY-$m^Z1GfM;9E-9QEda@}
zt*fgP&@4}wx0b^1ie5JX<iQ?zJD4`9^MEd`a;1AlEac^yLE>qqJ;YPBjybOB$^T}V
zeEitkU<hfR+VrBPssHe>HFfb*akk9>o=D)Notc?29K|xI(o4nJzzrd=`}^<C&+iyd
ztg~~HmpGz5w6iXGDF`q58dAU|U4D2}d8Duu!j=<Pq&fo$)6th5a#w)BQf&J%S(A58
zV$2wZVnDrpOYQl~$k=^Y&#ryR_kMG8Gv1h^LJl3qkv}mp(HROeTXd@zs5HY54aoY0
zJFZX$P`h1R>HU`dRA=V!Yv98X7q{n8LC~6y7bdCu67D~E@N>>oFwUk#Gbha%Dp)G$
z#To7}Utx*lOy3o)B;_|_zjQAV=A{qi97LKwse=PztAyH^nd*$8;60GS@04kgVFL@&
zI^$pVD;;^t!D#U3nBB$40omZ)K2j&7&uY>fCS!)*@sch9^$GZ)oCYc#7q(JTiuGMW
z=LaxZiHJZSwgKf~#4e0hy}w0t>+;LzyHmX_JNEkZc3%xgIr1P7Hi=U&fc+KbqyPG}
zV_<mXyLx;)pBbW&5y!vyoG&~dyuotM#=#H)#;2owliNR^3RI8jopG_qJpPnw@!;ZQ
z`P?&hREIrEMO}g7S29V3=v~*{ANfH3lE|`Nr3E2qb$NMNE~^L@<T~ye%)4#y<O5AJ
z3gG=-s}P@oTUD9KsW+reUhBPaUSiK6p`EKyJ$vJKkDRxWf5QHSSswG|v(zF6Zq@HY
zt*pc=$U;{+Q!A-H2mylGr5P|9d{oLdH3Tnd)f{8}+!e6e0Ru1&_!J<coeK>CdK!>3
z&~;TJGwFAfm$2<VRRl6$G-LP|M=&*7m|2j_zDm%_!V1x}M-A?05rx6G$RGz_PcfIq
zDOKCP*&DEZ5=s2(N3w}YrX|>X^<@J>Fo1_@RWSG&vivs5xScRwMY=0#&$!8aLSVBr
z-Y*uf?DyZQI#=c<p1QG(YA`O(yV{<3vJ-rA!+Ak87PUCgeH=1lEBfjJ+}Fj!o*dd^
z*AiT*8`$Y3uyNK0y_vG_r#-i2XzE5`%s-`~tFq;t)rPGR?>o_c9#Rt+%Av5lb}>#g
zU(W||HVY&IZt=p88~at+lT|QzDxH2?QES&dHf78u7|(l08wr*LtTuOe3vCTx`-{fz
z8t=%iXc%Z4+7+8wSg<+gr~5x=e=`AOkWW?dZ?5ORBQ8$)K9h{InbJwi%3>)eq%|fc
zI7!s#kskrB{9vF*Um4dGNEq;g0l7r(@%zP7n(OYyzR7I(+bg8Rj<0S}F$~{`G|E_;
zT94i{zg~$0Rv7As&8-J{r8IGn$N#Xa*UAd96!o*u4T^IA82MXfN@vccxe7%B;kBkA
z^Gtchn2}VN2158GagKjj6bPHKmNkKv&@u%_=^QRPLda>8xu4od&YV8Hhd)O+-E_^b
zFabtMav#I#xA*`Oh@9vb`=#Pl(Rzb*nlRTw(ezNCdRW=U=4En{^;p(va(GK{L)%8V
zf3Cg3bB;TS4s#u%*KEssI9rSekH^QOr3+B3PImfPq?70RbRO%d9(-`iO73L{5I~0=
z#1TQh1?4%mUwB~b3|umTZZ<jP=lE_Fa<=W1B}+YLk*&jsC;t8@`_#kZogWoMcm`k$
z<Yf}7OA!SP-j_Zp_yB29c<F35i6uEalk?sMz*4onv<?RhcTZP~(Rlzo09S0Fn4Onr
znu2u;47@46`V*AiHzuy)WL^XL|CHX}E(}p^Njq-u>?Ato{pzVWGu)F&(XB2%s3Y~>
zNx~+J(pD~yC1*_{4+Es)H^B*9A#V1Pux}<ZiqK~Y&|?5$$3Z}C%vm?9I$JjQGz!GE
zeO)*W;cZE%vA`vu^%QR{G;O>EBL5M{8}VIAA;eSl4jCLTdpn!jKdXN5;3weI*Z2L}
zUXmpK>T=zSbTN1R+~K%~<GZ-R{T1-s%Q&gI-z`#c%7BY3<b8w{3WfS?DQs_?i|->&
z3jjG7NIP+D`aPK;=kWaUD)D%z5i%&qWkd@F^l>0pu{ekI_Iz4u02$&#K7}*e`7*bz
z?CA~hxh}{-FCzpXn$>tS2O1W+J)or#BD-T*t+U|*6sDt{%aSy6?Xtt72t^K*`@_6=
zfnjD=?%RN)R8i3*t-E~rvO0FO;d(w-QQd>Y;sY68V$cHU#H(X#`O`b5cz%;iWpg7f
zQ-b!U?mr(mUp9d3p8X|9f|Qisv-SGyzR2dH5pJ(FZRV#01psY50Xpl+o!T-->yT6?
z&KwJBPOdip&#h^i@r&tz4JGiWH@$hJXJRaS21Y=d&r~q*%=WTprRiL68`dKcK5G=#
z7ePzmaH%Hup}@vD%-39ji9LU9=rdYfHRIH`!Dbh*Na0dPjijMsdRF8--V8fsXdT=R
z=to~q)#uhv8=OaqXv*AV-YY@*Iw|Us`+{uikf(919uBc6tWb}y4C<{*&nGh~qQ(o!
zJa4D=4h$S=lV_sS`3ltV4W9_QP%x-~w62skvwSBz^y_WV(@HRJf}y2!SZtAK#8|2(
zAG^)KoGv9C&_IV{z%k+zF<ypSo8SQyN#L#~SUGY|=gabcl1jT_yP;lv`YmMqqIHZt
zX4x5Y|5nl(b3ILssv1v6zh_}q@VHVLV87T1|9p)RN1rhm4c>p^P`hNPcvH2&Ta)Ty
zqaJp-fP}W-jBk=nk1b#y!tR*wm*)!9Ooe}wltWvPeb@k66hhu10NkivJds<Zt41vF
zVdBk}6?RY&pz9;y*z@!Hu(H8)c{>b+{%O=h69EE^-_CLFFK!UEbb{R=Cg1Xz(-Q$T
z^}WQqvH5cyVC`bZU#zUG#A?itTg(f+e7!9`YVY{=A#5w$I-hdJU#*`XX~k2MZ_bDD
z1Z*+jARgj}Ul6VyJ&dGt_m271b@Gi;04U-qP4(t82;y{WYpY1j{S{_X!<yCNOuEqP
zNkbO+Lb}k@;cFl>9e9B73w$Uj{bq(stKRKEok^?%WrE(oniqd{jOZ!z=kfVeV<h-a
z&z@4Ov6);{eW6hwB_?VEGfS&hG*$ZjI)iqf0-+Grhm%2G9#Q~R2+`tHp`jfm-0$~w
zGsoL>2fqx(H@jXKwa<z=$Eorb>w?UiE&?xnqm^yfNqk9;Dl}wE2JqRtyPgZ@g}%CM
zWE^C>H*JN}X~Y|`;SvaLq8V>ei{|6jasv05_|0^G126yrmH<JoK+7>D<C!%M@bF&U
z$CvBiUpSePF3F|w%>YM}-*YW<=IpCmZP^^hSNxPE(_R=90KwWzy1Fy3ZskM6f+VO)
zJQ>b%R<X!AU8wP|vh_%fUdP8fN#{A%_)WUQZqDfJlcPoJdI2noHD;YmeRbx6aqSEx
zHd2x9!~VWF;SlAn)%P{wIw+5WVvFw-*2gC$U34^tf@E2pEqcq*)=8J)Z4N;{l;cBC
z?x5bQope$7xLC?K%Te~0_uMKQS<S}m#(pv^GAeq>qkMIJP&R%=w5Y}Hpzx?msI$Ys
zsTs^&Acak=?e#CZPv6nzy80yS?t|%G&6Wju!e{^XmTh0`%txeo<;Jybe{1&aZdrQ9
zxWy|;CGUJ?Eg#1-P`E}!FG@C!%4+K$EHur-8*X{s3eq!~T_kHYJmUNoKaVbb^v&Qj
z4scc8a#D!<`Sg7*>kqwnC8-A&BPHV;C9>Xbz9d^oOcC1m@UY7o$`&xcwmp=>4s0<0
za7J?#`O!=9_y!Y?#D|08=<(qh*CtXSIU{y@brTi$gGPO;4JH*Id_8hT+J1Bs!gmO5
z`#!1@xK*fqL6v@EzhR4ja)&uG`GPR4HdQ2gg<VHi*Vv7<uPG!t+CbjPQmIvoK*7;=
zzEnl`pRR?#T>8-=+dg42yFsYE6~06|y?(6H^Njx}PNy+jVr4hxy7;JMT4ckad3jb9
zr#>hrfwc22X!z7@lw7w{foHQ<P_++%8wgvTF|X7`Lm2>fuxY->uw4@5T!jhaNHC1I
zu&}7xnc*Iwgu^@jX(?Q6v-7G^R(<YtI=^=P!g~aPW68wEJI~6gcE;w(Kg+%@Dwrp!
zh$@nbG-l!XXHfH*_&dVI-()1oLXA_IMC_UD<4YGMryJR|u0COl;J(*DX<&QCo96Jx
zVz!Wq0V{XX`mSg9oyvqMT|amCg6is#9A76~fD7QQHJX5*^RX9Y<4+Ps#`2JCUEMjo
z4!D*B!Y1F*D0jsRi7aWql7DYDA%#)P9cI5XaMoTZIFb94yhb}#aIWis$?%;q?b>^b
z#p85cBwKdhbkuez)4Xi7fvxd?Ux3zK)4caFAGz-V2b06$7vPMliwnHV5seGev^Pg1
z*(!F<=OEc%Wgh|TbCXpTD3G0jsM%-vCauYJRM!#ep10)k3%~VBFN=8cM-b_THl+0W
z8{7#TmH_**#e-~rm$+3K0CB68W>kIEB=-KR+n@=4TOS3_;rVh6@GqK+A$Jt2*JWBz
z_L0_QOMQj!f9M6Cyc@Q)K#9I7@F-I~b&Bw*L?X(>N(7hu@+x38qFEyih}X$L|49}A
zBbiLIW+rf{iq+s6OC(`)SLOj4IfOIwZ^{#Ev+oalu|pU)^7v}D_L}#5jRv<(rF1)i
z6G$h>nWe8l$SASoF%?Io<cVp&T=Oz{n#+SI#i5$Sv~Q%mBA86zXIBkI>)6Rjo*d$H
zu(_U%#aJ67BK1byp^UAJDgVaHE$`tb(Yvj!HSPGHQt@+eQNv!}7T;VR<Wls$U-frc
zuhn(2?8>A~Im!wn|E~e52Fd0FNco2Q31gRLJ1^S;_8_GjeG4(pFHh^-Z0d+~q$96F
zh(G)33OCTC_cRRW4^@yjbMAEhPd#SsAGeJB5+nj-X+XboY3olr*UvI+?6(0Opg%xP
zsC2>+sM}+J=(u?qwg2s=yWqspkIZ40o?=J-rK<&KWU_kzt0E&4H7n!FTcEDWX!0|E
z5lz+NDSRZCw_|>0bWvgoFqR(ivQ5Cr0T$YL#5|DZ1LTaDKh#c?zHOxWv_z0Iokaw5
zqbq`GsxR6$s(SA{=Z!8lQ_sLWOd-sC57E#aG>ywW4PiqT7CAA&ML;dS#*WTr{N>j<
z{rhVz?l=z!yA65vSZZVb_@r6=vF}?SR$PFTa3cx$X4b<_)6GwVk`B=wg9ZG{5xc&i
zEw;X#C18KEO)-aZUNc3sV>Q}cD<ndt;<2{;VHcK`eJ#LazcmvCXX9)W!12$NzgA9_
z{%1|Q)7GA>a3FtIEu~JaHO)FD+gcci0~&h%HEhPfPPWT`!O+?dc(uPbW`xtB?-bge
znsF0)EjsmXjSL6K*|Ycq7VWe|uMl|z^7f~r#dcr{tjc@&9gzIf@$uiUds1YV?gr*9
z;|_aIy5_bt0#gGPUiVt*JRzO*sOvp_966v0rEGV4+$YUIGn$&sREv#vEbQ#a%12k=
zc#Up+k%^bV0IRBII}hM{ENa!mdh@3PZUHkGVE_D1Mpry{1A}+EK@9j9=lm2`n+n`{
zz-%~0qkmI-!UPjJg4B6IfQ8e6($koSMu<n-z+H%4k7v;3Wir~2czhhNBk?p9;T0ID
zT{)%U$Ha;%*!MGE%K#SDPg@dmKPSWxvop#$_tnudin%Leew}s;wU_)BrG)fTe0#8!
zC6CK4%uEBi5p%*PVYkDfxr7+h(PhtAaFWo7-~&_fSC^%VE*wlq9)EM>UJF|$TTRjJ
zr+L*g8{&W>UMtZT7wAJ`JL}G{eeUM>3!`1r{Vo$UG4%d>ec&Ch?c_)=s+1)ed8R(Y
za5@3cixrvb_C9x1GD}Ui$!3ti9B>ZI0uE@jFxB}5{ONx)$dt5l^oJ4b6BIJQn28@D
zg)o1>h<jRLb<AQgrM|ngbnOZ7rnDZ=o>XL294ekFN}kp2Lsw=TZ;`_*oB?mO)W%Mr
z?Bk?PFVsmY4Hz^Or^vfwp7Uf55A>}B5|$?A&}Z2w`7@<wSdlK-xa9Ps?b-{%-?G&a
zKAz^yyE63B%VV=x8sw()tBM>jAxZgFIq|n$-W?|obn6<76uj#k$9T6-{T{PK>No1W
z^OBwls)6BOuFtM&+dVEq{}Wqj{{}Fwjd@9NfH$6ObKw`It8g&@5Y_%`ZN0%7>#~Ii
zD%DwxnbmEED_<`q`h|@1oTR*k?E@{pRl_CO8=l{^3)^Y27PUrNuO}V>F6==}w?*yC
zB@bVDsjd~@PqheEK<g5AYHTVm<~N*fQP?G&j9YwoqK!Ca+5h%UX;xkI_FM!pJ3IT0
zWf)4#`s|XSra1(C2Kayd9bt>=d?;ixHcHO+j&0)Ii}lVGf-JsH>>ZH@c9$qf`6G42
zWyT}6J}7+l)Ovrv?34R5t<v^t8hx_kx@PGlp1DgpucY=~kNS_1Keh{~{<^mZpM48(
zI(FgkPzu<phH+*?|L7?}0k1S3e|64DqA%R%O|!?lI{u3rTfqN4yi_;+JyharTU_pn
z_#+R;1$?y}Ey~@Y)vQkba;XRG^|bAzP7d(lnF%5<E4TMraUgboaX_nT!lLdD6;f#%
z-5>N9aHi)f9I|~UP4aflwyWE)k`Y+$`3Q!+ze1wX`Q&)20kg_8<zPirf0vZ7$cjWW
zYC$YK1~+%EZ!Iv)_ZF3s7cS#kr%2AuAWO(uw6ESMJ12yb*(oD4CI;n|PLwwg=PW2^
znXPVbWwL9UhOY`jFOxspe#8raW^s<sm59wD>I#5h++pg**YLnoK1~F;vN9jAU5Sfj
z$L0v7)N>uh#}AXSJq~`#Y6Bk0h>(E>HAe$PYUD^J8{F{-=Sa4IXZhJ-G_`~0i{>gv
zIVdfsI4g33aj2h#n|PZ)w<+5I2H#{SA=4POt9;$A00y|bTQB*f0F1y@NXstAD>b^c
zb(0`xKx6zZ|M5rKS8WRh*Fk0E39{yRgsMJ^=#uZy-D;$H8bGXsAcH4Lv>pGt8=3u2
zB(^%70AJ4R8!w9xW4=dQnhczJQ^@i~NkeMlt9{+!f+6JR06F<}t8-)K_Q_QTYKhz+
z*OzRX@?(=)HZ}XnYjf^Lb^?vxgz&xGpqT^uv9?b?^G&!Y{+zDFyW$3UVCZTpc|{B0
zKTAD#^+ubMW`G<je8`+OBYFBCTv>afL1t-ehC}Pj#@qWUBgRcOsilrZJY0-Qu#Tz#
za}bCLhco42PEUQd?+d<f-MrNdAawdvFbq{LPO@d=l$CtInSqIR*5otAscBEZk1n9=
z3|n;@_in8qbs7T<E~e^$O+OlPzqBGk(A_}28kna|8v18_>L&06jF;>IPJ7zmoFrJ;
zx(q<P+SnOw6l*!lZikd$QZyLdeMZE8w8=WVlMfhiB<tx~K8btaObj}StjmPy@Xh|t
z9`Y-7qAg=k<*ddjLkl|DD&JMyUbm}%)iqxe!!87zBkacpw+&wPmI@Sfm<gts3F!mp
zBVKf3>gq2TOIisMrf#|uKDyD*s52JGW=4`y_<{Yqw5_LkS|?BaKgRc4CX!=#u^F#u
zXDui*(#v;+)Abm<$>GuOMw_BG?JLfbFlK1-c{aVUt$#$50;;-G6J~jmntSb@vR5-a
zNBg2VJ$)b$Muyxo>%w=zwiK_+Bx3YR9~{I)Jv<}}+uy!-%mjwp+d(q8?Z!!QN2G5j
zm~F`YgBmREO~#7vbav!v0eRM0f@LvnT&z=2r#yRpB0_ps1kssMTCQ6e_tAn9=to6p
zxeTr3k;tpczVcevFW$(l2VC3yEd*DFCvCsG6GkTGR5MPHI<}Y2(cr6hRyGBckE2p*
zbQNLX5-Ppmn{O`CMrJ}xyly6TwMKocVQ&-|@BB{=T3=_nv1K#yOJi&nv`;MoSFi27
zmtOLMy<Xk-Q>7yx7j?+AlFKt4ORCxJsvKUHzUyNL%)6K-@pUsJ&VK@P^6O-69-CWy
zzEkmuOM6g<Zzwwn9f40t0YjZ|84m36$%b}}7dJdyfL*x#678&{fW}Ot=$HsOJe=-1
z@H7B9x{!pB4NzmRRX(w(H<dONd_Q$eJPg^~Gc;J?ypdn$&vspG06E`nuwDUD*pRKB
z6Bw^Nk&8e`yUH`!a3~9(R;*i)6<z0;de#&enYF2Z`5fmtkoG{%17}x()3)>K1m&lj
z?j`}(Lga72gZ@K2f;}PQS9MRD?O`oWcsu@pPg}oIRK)g78k-F+TD}Rv@Al-80`RCU
zGP8>U!%zu3^%5$#dZyu+1B}gPtU}7-b3|_BW{3uGRbK+-#oFVWO3<aw)^=}}Dq8u(
z)Zr3$V+(B^xf{|(6#QYl+DC!6v$RDrw5lq@Y$*)^d|?pwzoNr8Cun`e#DE88V;X)d
z|65y4s^`Z>Gr<y~MJdK$mOr|XV;JmgXc940f*!5p1wTC4PY1mF=4Q;UOqw^9VBF$|
zG$G4mR-s|hBVdlNZjqoR;xO`tm}VWLe_d-;N1*1%ch7<?2T`lrxkAl0NvCqA*6rK3
z8Bii~0U(6{+w*^32M_aRU5rNV3fLh4bQ0rKEar7G!gwi=`V5(~r>l%Nn{O`ro>sit
z)g_EhDguvpP2}Fh?*Oy9Z|-Ynmpr{x1h}%b#y@#K*=5PoBQP*9IM2NspJ`z)*WV-!
zEJVPwDX=rV+bmZc$G3n*%=I7mFMa&PrOPljGNQdj+#>{T?eK7Q;b!-N`UE!CAoJ##
z>2)}{fc^#IqVd80((vVS#<$vvDJO*P6Ma9KtQzehoXpYjVg2?a$Q#YXaG;mX&CL<i
zTn?R5sj=7|_-wjAZkhTlCr~*9_MTm8XSYAqfAHH~_}WEDs7a4Soj^OSS0LeFYsf!#
z6*UwxOsMWyK@)&(C{tEi1s<`W*0Njfs74@|fb}aq?m@Z!?g;$ErLwmwu4YOOchFLJ
z9wk(^3*L<$C}4TVCO{X!ew8pk5d%F|(08CR@BJqFKMCdkccV!c=;*(n5GBK#S;vWE
z4l2RF5AC<^)<$1*1DFz%C(lCUthw>%Gw@37be}YFWo}BwjdgSWch%rnS;#qh4C)u)
zo)6FMQGrY*mp1`S=CM?4tIWd>Y}I>SM6E8O;@e*ScL9W1OfX+*ZwK|OK=ItvGx7!e
z+HjiC00kY`1x6)F1RK@V&SZn2M1N7?zdc_eezBR#OQ)`t=7pe2G3AD{7h{%v?^jMm
z|32Cf0#EY^&FSy)xdQCQ0UKR_T=!|ki0NPNPDARS{cL-VUJB1w1tmpbG=ON^+{bI5
z=UP2i1H2qyn+ZD3$e^{@L~5})rsRBHsMVjnbz*Y)dVY=+9Gv7J-fi`n<reX`e3PUh
z0+!lPaIvXfYk50Zic}CF>quI5;rC#N#px%GCtpbgh=s(TD0X=&k_$>YwKGK^YhOvp
z*u7xfo6?5Eq&xX+9RZON;PkE_qK0uQ@l7eQX7~3m&@%m>4W}Xg>Wn&kDNN_VVt}F0
zS~2Eky(!lz^$*^%xX~-kFZ|t3Ny~~*C>f%mK7AA*wNF!~{nrf$TtT6v*E{TnJM2WP
z7{rWqJOQM$S1<kyhz@YYt8-&;gnSLD;y*j~AQj3Af4}Az8nvDO(-g#t<=;KU|NhCZ
z^uOo)>&XA_r2ogC{Zv(2x0U|3{}Jre`tv>Hah;J$@~Yt9euSukzxND$`2L<N;PH~`
ze@{*{)MW^bYHHK`yI<7m#X%(ybS~;vwR8Wy<kDR*M-J8rCoN+pe|ASgAa~wJMVY9m
zEJPBERk*!U|7Z8B+ZKT;Wv4I5F7y2VY?ZAmn|h2;S-1s{N+|hzqaUH(OgelkI$+D!
zpZoMo3?PJ`JMbu%z!~}@qRdjR{XJLs;q>FiPwl}!)?amdDXO$)?*8rCm76?;+kNuK
zeEGYwfrzqvg}(<%+vi!=_5B2tpM$=usMriq{7+-EEmM!H*$DDMnaT(wT?S72Sli_P
z>Auw>z{GO<!E!8Mp@UehuHyO51O2Df4*^p2)ZOu)eaBy(V#z{Y7?;V#SNhF=c5y-=
z5gJje-`+~;v-x?^q(p){x605h8~m9vA*DlWE>REc^8hx?W#Vt|vabwRnXEl>{c{{b
zm3VP1CaM&OptZZSHz?s63uHIg=&7RyWxeG8K38<VikC7axt<0kCER<9+{g@U-}dpW
xmV^;b_p6_NBz1lVYDTSY;7@93s0%lbf-`jW4J@8R#7+<1Q@yWJqhuBOe*jw?y1D=W

literal 0
HcmV?d00001


From f5b199faeae124ee248f013b8378ffd8fdfd733f Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 21 Mar 2024 16:22:34 +0100
Subject: [PATCH 042/123] =?UTF-8?q?welcome=20@betterstack-community=20?=
 =?UTF-8?q?=E2=9C=8C=EF=B8=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 _layouts/default.html | 4 +++-
 assets/css/app.css    | 6 +++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/_layouts/default.html b/_layouts/default.html
index aa0ae44..56f4a92 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -132,7 +132,9 @@
         Kindly supported by&nbsp; 👉
       </li>
       <li>
-        <img width="" src="assets/sponsor-betterstack.png" />
+        <a href="https://betterstack.com/">
+          <img width="" src="assets/sponsor-betterstack.png" />
+        </a>
       </li>
     </ul>
   </header>
diff --git a/assets/css/app.css b/assets/css/app.css
index 7f6c276..e42ff66 100644
--- a/assets/css/app.css
+++ b/assets/css/app.css
@@ -129,7 +129,11 @@ ul#sponsoring li {
     font-size: 16px;
 }
 
-ul#sponsoring li img {
+ul#sponsoring li a {
+    display: flex;
+}
+
+ul#sponsoring li a img {
     max-width: 180px;
     max-height: 80px;
 }

From 85b102df08cb18786f484599b8c6f0ef58f29d1d Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 21 Mar 2024 16:25:24 +0100
Subject: [PATCH 043/123] =?UTF-8?q?Welcome=20@betterstack-community=20?=
 =?UTF-8?q?=E2=9C=8C=EF=B8=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index b500cb7..c5c241c 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,21 @@
 
 Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
 
+<div align="center">
+  <hr>
+  <sup><b>Sponsored by:</b></sup>
+  <br>
+  <a href="https://betterstack.com">
+    <div>
+      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
+    </div>
+    <div>
+      Better Stack lets you centralize, search, and visualize your logs.
+    </div>
+  </a>
+  <hr>
+</div>
+
 ## ✨ Contents
 
 - [Rules](#-rules)

From 2494ccdf3164f06df9e90a1c444511088b52ef47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rastislav=20P=C3=B4bi=C5=A1?= <rastislav.pobis@gmail.com>
Date: Tue, 26 Mar 2024 16:56:15 +0100
Subject: [PATCH 044/123] Added prepared statements mysqld-exporter alert
 (#407)

---
 _data/rules.yml                      | 5 +++++
 dist/rules/mysql/mysqld-exporter.yml | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 9f430eb..3dc5c15 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -588,6 +588,11 @@ groups:
                 query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80"
                 severity: warning
                 for: 2m
+              - name: MySQL high prepared statements utilization (> 80%)
+                description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
+                query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80"
+                severity: warning
+                for: 2m
               - name: MySQL high threads running
                 description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
                 query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60"
diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml
index ad8ed5f..4811ee9 100644
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@@ -22,6 +22,15 @@ groups:
         summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
         description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    - alert: MySQL high prepared statements utilization (> 80%)
+      expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
+        description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
     - alert: MysqlHighThreadsRunning
       expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
       for: 2m

From 6b05a59ad9b78736079a1cbb80165d89b671f3e3 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 26 Mar 2024 15:57:31 +0000
Subject: [PATCH 045/123] Publish

---
 dist/rules/mysql/mysqld-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml
index 4811ee9..380fca3 100644
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@@ -22,7 +22,7 @@ groups:
         summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
         description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: MySQL high prepared statements utilization (> 80%)
+    - alert: MysqlHighPreparedStatementsUtilization(>80%)
       expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
       for: 2m
       labels:

From 267c3e8e70db9f39aa3f7cb697d58aaa3d790fbe Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 29 Apr 2024 22:35:43 +0200
Subject: [PATCH 046/123] Update rules.yml

---
 _data/rules.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 3dc5c15..838591e 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -199,6 +199,7 @@ groups:
                 description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem"
                 query: "node_filesystem_device_error == 1"
                 severity: critical
+                for: 2m
               - name: Host inodes will fill in 24 hours
                 description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
                 query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'

From b77cb3467c1de3dd45d35e8d4e459a10df544628 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 29 Apr 2024 20:36:49 +0000
Subject: [PATCH 047/123] Publish

---
 dist/rules/host-and-hardware/node-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index de48231..6655ef7 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -96,7 +96,7 @@ groups:
 
     - alert: HostFilesystemDeviceError
       expr: 'node_filesystem_device_error == 1'
-      for: 0m
+      for: 2m
       labels:
         severity: critical
       annotations:

From aad1c4cd959a4713cba9e614513d20ddcaf9cd93 Mon Sep 17 00:00:00 2001
From: Sergey Shtoltz <shtoltz@users.noreply.github.com>
Date: Thu, 2 May 2024 21:48:46 +0300
Subject: [PATCH 048/123] RedisOutOfConfiguredMaxmemory: checking if memory
 limit is set (#410)

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 838591e..109b0c8 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -821,7 +821,7 @@ groups:
                   The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
               - name: Redis out of configured maxmemory
                 description: Redis is running out of configured maxmemory (> 90%)
-                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90"
+                query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
                 severity: warning
                 for: 2m
               - name: Redis too many connections

From 5c0963558a1165f89bf2c0216c52512fb07602da Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Thu, 2 May 2024 18:49:56 +0000
Subject: [PATCH 049/123] Publish

---
 dist/rules/redis/oliver006-redis-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml
index 08cdf23..6b4dd8d 100644
--- a/dist/rules/redis/oliver006-redis-exporter.yml
+++ b/dist/rules/redis/oliver006-redis-exporter.yml
@@ -77,7 +77,7 @@ groups:
         description: "Redis is running out of system memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RedisOutOfConfiguredMaxmemory
-      expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
+      expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
       for: 2m
       labels:
         severity: warning

From 59e6a9165dfb5dd23a3a3eafdf75210a818982e8 Mon Sep 17 00:00:00 2001
From: enesyalinkaya <49714068+enesyalinkaya@users.noreply.github.com>
Date: Mon, 6 May 2024 02:32:00 +0300
Subject: [PATCH 050/123] add new alerts for elasticsearch rules.yml (#411)

This commit adds new Prometheus alert definitions to monitor indexing and query metrics in Elasticsearch clusters. These alerts are essential for detecting performance issues related to indexing and querying activities.
---
 _data/rules.yml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 109b0c8..744bf10 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1128,6 +1128,26 @@ groups:
                 description: No new documents for 10 min!
                 query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
                 severity: warning
+              - name: Elasticsearch High Indexing Latency
+                description: "The indexing latency on Elasticsearch cluster is higher than the threshold."
+                query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005"
+                severity: warning
+                for: 10m       
+              - name: Elasticsearch High Indexing Rate
+                description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
+                query: "elasticsearch_indices_indexing_index_total > 100000"
+                severity: warning
+                for: 5m     
+              - name: Elasticsearch High Query Rate
+                description: "The query rate on Elasticsearch cluster is higher than the threshold."
+                query: "elasticsearch_indices_search_query_total > 100000"
+                severity: warning
+                for: 5m
+              - name: Elasticsearch High Query Latency
+                description: "The query latency on Elasticsearch cluster is higher than the threshold."
+                query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1"
+                severity: warning
+                for: 5m                 
 
       - name: Cassandra
         exporters:

From 515fca9c10898f728c116c9816a186c9d600a5b4 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 5 May 2024 23:33:11 +0000
Subject: [PATCH 051/123] Publish

---
 ...theus-community-elasticsearch-exporter.yml | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
index 4ed5660..9aeadec 100644
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@@ -138,3 +138,39 @@ groups:
       annotations:
         summary: Elasticsearch no new documents (instance {{ $labels.instance }})
         description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ElasticsearchHighIndexingLatency
+      expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
+        description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ElasticsearchHighIndexingRate
+      expr: 'elasticsearch_indices_indexing_index_total > 100000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
+        description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ElasticsearchHighQueryRate
+      expr: 'elasticsearch_indices_search_query_total > 100000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
+        description: "The query rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ElasticsearchHighQueryLatency
+      expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
+        description: "The query latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 2547288c13ba21703dbd4be25e3c6e4618180255 Mon Sep 17 00:00:00 2001
From: Ali <115415312+xogoodnow@users.noreply.github.com>
Date: Mon, 13 May 2024 12:02:18 +0330
Subject: [PATCH 052/123] Added Clickhouse (#412)

* Added Clickhouse

* Update rules.yml

Added reasonable time periods for each query to avoid false positives and, in some cases, to give the system a short window to try to resolve the issue.
Also changed the severity level of authentication alerts from critical to info, which seems more appropriate.

* Modified time periods for alerts in embedded-exporter.yml

I made a few adjustments in time periods.
See if they seem reasonable or not

* Replication alerts time periods were adjusted

IMHO, replication alerts must be sent right away.
---
 _data/rules.yml                             |  82 ++++++++++++
 dist/rules/clickhouse/embedded-exporter.yml | 131 ++++++++++++++++++++
 2 files changed, 213 insertions(+)
 create mode 100644 dist/rules/clickhouse/embedded-exporter.yml

diff --git a/_data/rules.yml b/_data/rules.yml
index 744bf10..ffb4604 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1300,6 +1300,88 @@ groups:
                 severity: critical
                 for: 2m
 
+      - name: Clickhouse
+        exporters:
+          - name: Embedded Exporter
+            slug: embedded-exporter
+            doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
+            rules:
+              - name: ClickHouse Memory Usage Critical
+                description: Memory usage is critically high, over 90%.
+                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90"
+                severity: critical
+                for: 5m
+              - name: ClickHouse Memory Usage Warning
+                description: Memory usage is over 80%.
+                query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80"
+                severity: warning
+                for: 5m
+              - name: ClickHouse Disk Space Low on Default
+                description: Disk space on default is below 20%.
+                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20"
+                severity: warning
+                for: 2m
+              - name: ClickHouse Disk Space Critical on Default
+                description: Disk space on default disk is critically low, below 10%.
+                query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10"
+                severity: critical
+                for: 2m
+              - name: ClickHouse Disk Space Low on Backups
+                description: Disk space on backups is below 20%.
+                query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20"
+                severity: warning
+                for: 2m
+              - name: ClickHouse Replica Errors
+                description: Critical replica errors detected, either all replicas are stale or lost.
+                query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1"
+                severity: critical
+                for: 0m
+              - name: ClickHouse No Available Replicas
+                description: No available replicas in ClickHouse.
+                query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1"
+                severity: critical
+                for: 0m
+              - name: ClickHouse No Live Replicas
+                description: There are too few live replicas available, risking data loss and service disruption.
+                query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1"
+                severity: critical
+                for: 0m
+              - name: ClickHouse High Network Traffic
+                description: Network traffic is unusually high, may affect cluster performance.
+                query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250"
+                severity: warning
+                for: 5m
+                comments: |
+                  Please replace the threshold with an appropriate value
+              - name: ClickHouse High TCP Connections
+                description: High number of TCP connections, indicating heavy client or inter-cluster communication.
+                query: "ClickHouseMetrics_TCPConnection > 400"
+                severity: warning
+                for: 5m
+                comments: |
+                  Please replace the threshold with an appropriate value
+              - name: ClickHouse Interserver Connection Issues
+                description: An increase in interserver connections may indicate replication or distributed query handling issues.
+                query: "increase(ClickHouseMetrics_InterserverConnection[5m]) > 0"
+                severity: warning
+                for: 1m
+              - name: ClickHouse ZooKeeper Connection Issues
+                description: ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.
+                query: "avg(ClickHouseMetrics_ZooKeeperSession) != 1"
+                severity: warning
+                for: 3m
+              - name: ClickHouse Authentication Failures
+                description: Authentication failures detected, indicating potential security issues or misconfiguration.
+                query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0"
+                severity: info
+                for: 0m
+              - name: ClickHouse Access Denied Errors
+                description: Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.
+                query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0"
+                severity: info
+                for: 0m
+
+
       - name: Zookeeper
         exporters:
           - name: cloudflare/kafka_zookeeper_exporter
diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml
new file mode 100644
index 0000000..19917bb
--- /dev/null
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@@ -0,0 +1,131 @@
+groups:
+- name: EmbeddedExporter
+  rules:
+    - alert: ClickHouseMemoryUsageCritical
+      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
+        description: "Memory usage is critically high, over 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseMemoryUsageWarning
+      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
+        description: "Memory usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseDiskSpaceLowDefault
+      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
+        description: "Disk space on default is below 20%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseDiskSpaceCriticalDefault
+      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
+      for: 2m
+      labels:
+        severity: critical
+      annotations:
+        summary: ClickHouse Disk Space Critical on Default Disk (instance {{ $labels.instance }})
+        description: "Disk space on default disk is critically low, below 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseDiskSpaceLowBackups
+      expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
+        description: "Disk space on backups is below 20%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseReplicaErrors
+      expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: ClickHouse Replica Errors Detected (instance {{ $labels.instance }})
+        description: "Critical replica errors detected, either all replicas are stale or lost.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseNoAvailableReplicas
+      expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: No Available Replicas in ClickHouse (instance {{ $labels.instance }})
+        description: "No available replicas in ClickHouse.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseNoLiveReplicas
+      expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: No Live Replicas in ClickHouse (instance {{ $labels.instance }})
+        description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+
+    - alert: ClickHouseNetworkUsageHigh
+      expr: 'ClickHouseMetrics_NetworkSend > 1000 or ClickHouseMetrics_NetworkReceive > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: High Network Traffic in ClickHouse (instance {{ $labels.instance }})
+        description: "Network traffic is unusually high, may affect cluster performance.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseHighTCPConnections
+      expr: 'ClickHouseMetrics_TCPConnection > 1500'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: High TCP Connections in ClickHouse (instance {{ $labels.instance }})
+        description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseInterserverConnectionIssues
+      expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Interserver Connection Issues in ClickHouse (instance {{ $labels.instance }})
+        description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseZooKeeperConnectionIssues
+      expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: ZooKeeper Connection Issues in ClickHouse (instance {{ $labels.instance }})
+        description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseAuthenticationFailures
+      expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Authentication Failures in ClickHouse (instance {{ $labels.instance }})
+        description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: ClickHouseAccessDeniedErrors
+      expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Access Denied Errors in ClickHouse (instance {{ $labels.instance }})
+        description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+

From 84b0569c97975361b600f25aa90d5fc1e583bd87 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 13 May 2024 08:33:30 +0000
Subject: [PATCH 053/123] Publish

---
 dist/rules/clickhouse/embedded-exporter.yml | 74 ++++++++++-----------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml
index 19917bb..3efe551 100644
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@@ -1,7 +1,10 @@
 groups:
+
 - name: EmbeddedExporter
+
   rules:
-    - alert: ClickHouseMemoryUsageCritical
+
+    - alert: ClickhouseMemoryUsageCritical
       expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
       for: 5m
       labels:
@@ -10,122 +13,119 @@ groups:
         summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
         description: "Memory usage is critically high, over 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseMemoryUsageWarning
+    - alert: ClickhouseMemoryUsageWarning
       expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
       for: 5m
       labels:
         severity: warning
       annotations:
         summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
-        description: "Memory usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Memory usage is over 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseDiskSpaceLowDefault
+    - alert: ClickhouseDiskSpaceLowOnDefault
       expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
       for: 2m
       labels:
         severity: warning
       annotations:
         summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
-        description: "Disk space on default is below 20%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk space on default is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseDiskSpaceCriticalDefault
+    - alert: ClickhouseDiskSpaceCriticalOnDefault
       expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
       for: 2m
       labels:
         severity: critical
       annotations:
-        summary: ClickHouse Disk Space Critical on Default Disk (instance {{ $labels.instance }})
+        summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
         description: "Disk space on default disk is critically low, below 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseDiskSpaceLowBackups
+    - alert: ClickhouseDiskSpaceLowOnBackups
       expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
       for: 2m
       labels:
         severity: warning
       annotations:
         summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
-        description: "Disk space on backups is below 20%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk space on backups is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseReplicaErrors
+    - alert: ClickhouseReplicaErrors
       expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: ClickHouse Replica Errors Detected (instance {{ $labels.instance }})
+        summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
         description: "Critical replica errors detected, either all replicas are stale or lost.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseNoAvailableReplicas
+    - alert: ClickhouseNoAvailableReplicas
       expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: No Available Replicas in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
         description: "No available replicas in ClickHouse.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseNoLiveReplicas
+    - alert: ClickhouseNoLiveReplicas
       expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: No Live Replicas in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
         description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-
-    - alert: ClickHouseNetworkUsageHigh
-      expr: 'ClickHouseMetrics_NetworkSend > 1000 or ClickHouseMetrics_NetworkReceive > 1000'
+    - alert: ClickhouseHighNetworkTraffic
+      expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
       for: 5m
       labels:
         severity: warning
       annotations:
-        summary: High Network Traffic in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
         description: "Network traffic is unusually high, may affect cluster performance.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseHighTCPConnections
-      expr: 'ClickHouseMetrics_TCPConnection > 1500'
+    - alert: ClickhouseHighTcpConnections
+      expr: 'ClickHouseMetrics_TCPConnection > 400'
       for: 5m
       labels:
         severity: warning
       annotations:
-        summary: High TCP Connections in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
         description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseInterserverConnectionIssues
+    - alert: ClickhouseInterserverConnectionIssues
       expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
-      for: 0m
+      for: 1m
       labels:
         severity: warning
       annotations:
-        summary: Interserver Connection Issues in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
         description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseZooKeeperConnectionIssues
+    - alert: ClickhouseZookeeperConnectionIssues
       expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
-      for: 5m
+      for: 3m
       labels:
         severity: warning
       annotations:
-        summary: ZooKeeper Connection Issues in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
         description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseAuthenticationFailures
+    - alert: ClickhouseAuthenticationFailures
       expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
       for: 0m
       labels:
-        severity: critical
+        severity: info
       annotations:
-        summary: Authentication Failures in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
         description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: ClickHouseAccessDeniedErrors
+    - alert: ClickhouseAccessDeniedErrors
       expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
-      for: 1m
+      for: 0m
       labels:
-        severity: critical
+        severity: info
       annotations:
-        summary: Access Denied Errors in ClickHouse (instance {{ $labels.instance }})
+        summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
         description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-

From 847143ecc94909ff05ebff39b83a91272122a68b Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 13 May 2024 10:42:04 +0200
Subject: [PATCH 054/123] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c5c241c..16c92c7 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
 - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
 - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
+- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
 - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
 - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
 - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)

From 870bbd47d2d9b09dee7e286961c3b3043976a76c Mon Sep 17 00:00:00 2001
From: Vijay Dharap <VDHARAP@volvocars.com>
Date: Mon, 13 May 2024 09:10:55 +0000
Subject: [PATCH 055/123] Fixed HPA rule to use more correct condition (#408)

* Fixed HPA rule to use more correct condition

* Update rules.yml

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index ffb4604..3f29fb0 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1981,7 +1981,7 @@ groups:
                 for: 1m
               - name: Kubernetes HPA scale inability
                 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
-                query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+                query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
                 severity: warning
                 for: 2m
               - name: Kubernetes HPA metrics unavailability

From 613401a9600b5d8f31ec0a4890371b978becee62 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 13 May 2024 09:12:01 +0000
Subject: [PATCH 056/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index e43a1fb..9014275 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -122,7 +122,7 @@ groups:
         description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaScaleInability
-      expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+      expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
       for: 2m
       labels:
         severity: warning

From 396083a2a1daabeb6b7b60a29c5c0ec3eef215b7 Mon Sep 17 00:00:00 2001
From: Florian Schlichting <fsfs@debian.org>
Date: Mon, 13 May 2024 12:09:04 +0200
Subject: [PATCH 057/123] Fix HaproxyBackendMaxActiveSession: look at current /
 limit (#413)

haproxy_backend_max_sessions is the maximum number of sessions ever encountered during the lifetime of the HAProxy process. That is, it will never go down until HAProxy is restarted, so the alert continues to fire even though the situation has cleared!

This doesn't make sense. Look at the currently active sessions instead.
---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 3f29fb0..61425a5 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1785,7 +1785,7 @@ groups:
                 severity: critical
               - name: HAProxy backend max active session
                 description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
-                query: "((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
+                query: "((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
                 severity: warning
                 for: 2m
               - name: HAProxy pending requests

From 04886da968b4686de0af0c19f0fb4baa05f8265e Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 13 May 2024 10:10:12 +0000
Subject: [PATCH 058/123] Publish

---
 dist/rules/haproxy/haproxy-exporter-v1.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml
index 2b2f93f..7be81a0 100644
--- a/dist/rules/haproxy/haproxy-exporter-v1.yml
+++ b/dist/rules/haproxy/haproxy-exporter-v1.yml
@@ -77,7 +77,7 @@ groups:
         description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HaproxyBackendMaxActiveSession
-      expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
+      expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
       for: 2m
       labels:
         severity: warning

From 4963331101e42b4a799978891a5c9c5f927623fb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 14 May 2024 01:41:57 +0200
Subject: [PATCH 059/123] build(deps-dev): bump nokogiri from 1.16.2 to 1.16.5
 (#415)

Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.16.2 to 1.16.5.
- [Release notes](https://github.com/sparklemotion/nokogiri/releases)
- [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md)
- [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.16.2...v1.16.5)

---
updated-dependencies:
- dependency-name: nokogiri
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index df07b1f..2f8e470 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -231,7 +231,7 @@ GEM
       jekyll-seo-tag (~> 2.1)
     minitest (5.17.0)
     multipart-post (2.1.1)
-    nokogiri (1.16.2-x86_64-linux)
+    nokogiri (1.16.5-x86_64-linux)
       racc (~> 1.4)
     octokit (4.22.0)
       faraday (>= 0.9)

From 8460f9008e1eb191bb62d445ea17709698ce63db Mon Sep 17 00:00:00 2001
From: "R.Sicart" <roger.sicart@gmail.com>
Date: Tue, 14 May 2024 20:34:43 +0200
Subject: [PATCH 060/123] fix: some kube api alert lint (#416)

* fix: apiserver regexp matchers are automatically fully anchored

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

* fix: apiserver errors alert is using the `instance` label but the query removes it

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

* fix: apiserver latency alert is using the `resource` label but the query removes it

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

---------

Signed-off-by: R.Sicart <roger.sicart@gmail.com>
---
 _data/rules.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 61425a5..eb87723 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2071,7 +2071,7 @@ groups:
                 for: 12h
               - name: Kubernetes API server errors
                 description: Kubernetes API server is experiencing high error rate
-                query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+                query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
                 severity: critical
                 for: 2m
               - name: Kubernetes API client errors
@@ -2089,7 +2089,7 @@ groups:
                 severity: critical
               - name: Kubernetes API server latency
                 description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
-                query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
+                query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
                 severity: warning
                 for: 2m
 

From 81079a2a7e9923ce369b35301ddcbf660bea1f09 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 14 May 2024 18:35:54 +0000
Subject: [PATCH 061/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 9014275..3f9dc6a 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -266,7 +266,7 @@ groups:
         description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerErrors
-      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
       for: 2m
       labels:
         severity: critical
@@ -302,7 +302,7 @@ groups:
         description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesApiServerLatency
-      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
       for: 2m
       labels:
         severity: warning

From 262e45162569863ea1784dcfc68066be32fcbe71 Mon Sep 17 00:00:00 2001
From: "R.Sicart" <roger.sicart@gmail.com>
Date: Tue, 14 May 2024 20:43:00 +0200
Subject: [PATCH 062/123] kube hpa lint and improvement (#417)

* fix: hpa alerts are using the `instance` label but the queries remove it

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

* fix: hpa alert is using the `namespace` label but the query removes it

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

* feat: hpa scale max should not alert when min and max are the same

Signed-off-by: R.Sicart <roger.sicart@gmail.com>

---------

Signed-off-by: R.Sicart <roger.sicart@gmail.com>
---
 _data/rules.yml                              |  2 +-
 dist/rules/kubernetes/kubestate-exporter.yml | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index eb87723..4d0d9f4 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1990,7 +1990,7 @@ groups:
                 severity: warning
               - name: Kubernetes HPA scale maximum
                 description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
-                query: "kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas"
+                query: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
                 severity: info
                 for: 2m
               - name: Kubernetes HPA underutilized
diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 3f9dc6a..8684fdf 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -127,7 +127,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
+        summary: Kubernetes HPA scale inability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaMetricsUnavailability
@@ -136,7 +136,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
+        summary: Kubernetes HPA metrics unavailability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaScaleMaximum
@@ -145,16 +145,16 @@ groups:
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
+        summary: Kubernetes HPA scale maximum ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaUnderutilized
-      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
+      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler, namespace) > 3'
       for: 0m
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
+        summary: Kubernetes HPA underutilized ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodNotHealthy

From 826be5877ffc129ca3511cd25af87d67ea67fb48 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 14 May 2024 18:44:11 +0000
Subject: [PATCH 063/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 8684fdf..2db1d64 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -127,7 +127,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA scale inability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaMetricsUnavailability
@@ -136,25 +136,25 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Kubernetes HPA metrics unavailability ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaScaleMaximum
-      expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+      expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
       for: 2m
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA scale maximum ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesHpaUnderutilized
-      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler, namespace) > 3'
+      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
       for: 0m
       labels:
         severity: info
       annotations:
-        summary: Kubernetes HPA underutilized ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})
+        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
         description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesPodNotHealthy

From 9877561b6cfc2c145db1b7de6f85c084989be10d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Enes=20Yal=C4=B1nkaya?=
 <49714068+enesyalinkaya@users.noreply.github.com>
Date: Wed, 15 May 2024 09:07:55 +0300
Subject: [PATCH 064/123] fix elasticsearch rate rules (#418)

* fix elasticsearch rate rules

* fix

* fix

* fix
---
 _data/rules.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 4d0d9f4..bf8ee70 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1135,12 +1135,12 @@ groups:
                 for: 10m       
               - name: Elasticsearch High Indexing Rate
                 description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
-                query: "elasticsearch_indices_indexing_index_total > 100000"
+                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 100000"
                 severity: warning
                 for: 5m     
               - name: Elasticsearch High Query Rate
                 description: "The query rate on Elasticsearch cluster is higher than the threshold."
-                query: "elasticsearch_indices_search_query_total > 100000"
+                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100000"
                 severity: warning
                 for: 5m
               - name: Elasticsearch High Query Latency

From 1adecd9ee79ce65ec546ad77d6d47ab04259f689 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Wed, 15 May 2024 08:08:58 +0200
Subject: [PATCH 065/123] Update rules.yml

---
 _data/rules.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index bf8ee70..a41d8a2 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1125,7 +1125,7 @@ groups:
                 severity: warning
                 for: 15m
               - name: Elasticsearch no new documents
-                description: No new documents for 10 min!
+                description: "No new documents for 10 min!"
                 query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
                 severity: warning
               - name: Elasticsearch High Indexing Latency
@@ -1135,12 +1135,12 @@ groups:
                 for: 10m       
               - name: Elasticsearch High Indexing Rate
                 description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
-                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 100000"
+                query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
                 severity: warning
                 for: 5m     
               - name: Elasticsearch High Query Rate
                 description: "The query rate on Elasticsearch cluster is higher than the threshold."
-                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100000"
+                query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
                 severity: warning
                 for: 5m
               - name: Elasticsearch High Query Latency

From 7dd767c4b4e4bcc2fa8d2b86cee89402f80acacd Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Wed, 15 May 2024 06:10:06 +0000
Subject: [PATCH 066/123] Publish

---
 .../prometheus-community-elasticsearch-exporter.yml           | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
index 9aeadec..5e6bb9d 100644
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@@ -149,7 +149,7 @@ groups:
         description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ElasticsearchHighIndexingRate
-      expr: 'elasticsearch_indices_indexing_index_total > 100000'
+      expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
       for: 5m
       labels:
         severity: warning
@@ -158,7 +158,7 @@ groups:
         description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: ElasticsearchHighQueryRate
-      expr: 'elasticsearch_indices_search_query_total > 100000'
+      expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
       for: 5m
       labels:
         severity: warning

From 61a40270d96ba9d7ae6489254f9dad2775f8e00c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Thu, 16 May 2024 23:28:17 +0200
Subject: [PATCH 067/123] build(deps-dev): bump rexml from 3.2.5 to 3.2.8
 (#420)

Bumps [rexml](https://github.com/ruby/rexml) from 3.2.5 to 3.2.8.
- [Release notes](https://github.com/ruby/rexml/releases)
- [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md)
- [Commits](https://github.com/ruby/rexml/compare/v3.2.5...v3.2.8)

---
updated-dependencies:
- dependency-name: rexml
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 2f8e470..ca3c33c 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -243,7 +243,8 @@ GEM
     rb-fsevent (0.11.1)
     rb-inotify (0.10.1)
       ffi (~> 1.0)
-    rexml (3.2.5)
+    rexml (3.2.8)
+      strscan (>= 3.0.9)
     rouge (3.26.0)
     ruby2_keywords (0.0.5)
     rubyzip (2.3.2)
@@ -258,6 +259,7 @@ GEM
       faraday (> 0.8, < 2.0)
     simpleidn (0.2.1)
       unf (~> 0.1.4)
+    strscan (3.1.0)
     terminal-table (1.8.0)
       unicode-display_width (~> 1.1, >= 1.1.1)
     thread_safe (0.3.6)

From 9b0ac7d230f18f7a371ece02d7ecb7724007faa9 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 23 May 2024 14:44:45 +0200
Subject: [PATCH 068/123] Update rules.yml

---
 _data/rules.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index a41d8a2..cd0123e 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -876,11 +876,6 @@ groups:
                 query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
                 severity: warning
                 for: 2m
-              - name: MongoDB virtual memory usage
-                description: High memory usage
-                query: "(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3"
-                severity: warning
-                for: 2m
 
           - name: dcu/mongodb_exporter
             slug: dcu-mongodb-exporter

From 8759c50440e0c95363c739432e8c3a21b864c8e0 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Thu, 23 May 2024 12:45:56 +0000
Subject: [PATCH 069/123] Publish

---
 dist/rules/mongodb/percona-mongodb-exporter.yml | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml
index 3e1e5e9..1bd446f 100644
--- a/dist/rules/mongodb/percona-mongodb-exporter.yml
+++ b/dist/rules/mongodb/percona-mongodb-exporter.yml
@@ -66,12 +66,3 @@ groups:
       annotations:
         summary: MongoDB too many connections (instance {{ $labels.instance }})
         description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: MongodbVirtualMemoryUsage
-      expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
-        description: "High memory usage\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 1e4ea0b3e75cdbf59a56c3e1cad94deaf2ab723f Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 6 Jun 2024 22:53:29 +0200
Subject: [PATCH 070/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index cd0123e..9a08f71 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2421,7 +2421,7 @@ groups:
             rules:
               - name: Minio cluster disk offline
                 description: "Minio cluster disk is offline"
-                query: "minio_cluster_disk_offline_total > 0"
+                query: "minio_cluster_drive_offline_total > 0"
                 severity: critical
               - name: Minio node disk offline
                 description: "Minio cluster node disk is offline"

From 1ee046b7392426039031b521a433dcc177104134 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Thu, 6 Jun 2024 20:54:49 +0000
Subject: [PATCH 071/123] Publish

---
 dist/rules/minio/embedded-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/minio/embedded-exporter.yml b/dist/rules/minio/embedded-exporter.yml
index 8e19729..1ac2de5 100644
--- a/dist/rules/minio/embedded-exporter.yml
+++ b/dist/rules/minio/embedded-exporter.yml
@@ -5,7 +5,7 @@ groups:
   rules:
 
     - alert: MinioClusterDiskOffline
-      expr: 'minio_cluster_disk_offline_total > 0'
+      expr: 'minio_cluster_drive_offline_total > 0'
       for: 0m
       labels:
         severity: critical

From ca4fb01c6dda3514048fb28166d3bd9f40b06ef7 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Fri, 14 Jun 2024 20:15:44 +0200
Subject: [PATCH 072/123] Update rules.yml

---
 _data/rules.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 9a08f71..8994d44 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -240,12 +240,15 @@ groups:
                 query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 5m
-              - name: Host context switching
-                description: Context switching is growing on the node (> 10000 / CPU / s)
-                query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host context switching high
+                description: Context switching is growing on the node (twice the daily average during the last 15m)
+                query: |
+                  (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
+                  /
+                  (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
                 severity: warning
                 comments: |
-                  10000 context switches is an arbitrary number.
+                  x2 context switches is an arbitrary number.
                   The alert threshold depends on the nature of the application.
                   Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
               - name: Host swap is filling up

From 60c235975c4a34354d30031cef12234d3fe7e3f6 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Fri, 14 Jun 2024 18:16:53 +0000
Subject: [PATCH 073/123] Publish

---
 dist/rules/host-and-hardware/node-exporter.yml | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 6655ef7..0d80c16 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -175,14 +175,17 @@ groups:
         summary: Host unusual disk IO (instance {{ $labels.instance }})
         description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostContextSwitching
-      expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    - alert: HostContextSwitchingHigh
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
+/
+(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+'
       for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Host context switching (instance {{ $labels.instance }})
-        description: "Context switching is growing on the node (> 10000 / CPU / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host context switching high (instance {{ $labels.instance }})
+        description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostSwapIsFillingUp
       expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'

From b6a6c2e31315873c39c28622ffb2f26ad8f5ce9b Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 2 Jul 2024 09:33:01 +0200
Subject: [PATCH 074/123] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 16c92c7..9188322 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
 - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
 - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
+- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
 - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
 - [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
 - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)

From 9557d4b50e09f31d2f86ac1c70f87a025a6ac1b6 Mon Sep 17 00:00:00 2001
From: Greg <58505377+nohant@users.noreply.github.com>
Date: Tue, 2 Jul 2024 09:33:08 +0200
Subject: [PATCH 075/123] feat(meilisearch): add basic set of rules (#425)

* feat(meilisearch): add basic meilisearch rules

* fix(query): use == instead of =

* fix(data): set correct name and use ==

* chore(meilisearch): remove index filter
---
 _data/rules.yml                              | 15 +++++++++++++
 dist/rules/meilisearch/embedded-exporter.yml | 23 ++++++++++++++++++++
 2 files changed, 38 insertions(+)
 create mode 100644 dist/rules/meilisearch/embedded-exporter.yml

diff --git a/_data/rules.yml b/_data/rules.yml
index 8994d44..128793f 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -842,6 +842,21 @@ groups:
                 query: "increase(redis_rejected_connections_total[1m]) > 0"
                 severity: critical
 
+      - name: Meilisearch
+        exporters:
+          - name: Embedded exporter
+            slug: embedded-exporter
+            doc_url: https://github.com/orgs/meilisearch/discussions/625
+            rules:                
+              - name: Meilisearch index is empty
+                description: Meilisearch instance is down
+                query: 'meilisearch_index_docs_count == 0'
+                severity: warning
+              - name: Meilisearch http response time
+                description: Meilisearch http response time is too high
+                query: "meilisearch_http_response_time_seconds > 0.5"
+                severity: warning
+
       - name: MongoDB
         exporters:
           - name: percona/mongodb_exporter
diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml
new file mode 100644
index 0000000..a8824dd
--- /dev/null
+++ b/dist/rules/meilisearch/embedded-exporter.yml
@@ -0,0 +1,23 @@
+groups:
+
+- name: EmbeddedExporter
+
+  rules:
+
+    - alert: MeilisearchIndexIsEmpty
+      expr: meilisearch_index_docs_count == 0
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: the index {{ $labels.Index }} is empty
+        description: "The index {{ $labels.Index }} is empty at the moment, and shouldnt be empty\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        
+    - alert: MeilisearchHttpResponseTimeIsTooHigh
+      expr: rate(meilisearch_http_response_time_seconds_sum[5m]) / rate(meilisearch_http_response_time_seconds_count[5m]) > 0.5
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: the meilisearch server http response time is too high
+        description: "The meilisearch server http response time is too high at the moment\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 47e74f65e02fbbb60fddaf450cca3178ef5e4ecd Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 2 Jul 2024 09:33:51 +0200
Subject: [PATCH 076/123] Update rules.yml

---
 _data/rules.yml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 128793f..0216beb 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -842,21 +842,6 @@ groups:
                 query: "increase(redis_rejected_connections_total[1m]) > 0"
                 severity: critical
 
-      - name: Meilisearch
-        exporters:
-          - name: Embedded exporter
-            slug: embedded-exporter
-            doc_url: https://github.com/orgs/meilisearch/discussions/625
-            rules:                
-              - name: Meilisearch index is empty
-                description: Meilisearch instance is down
-                query: 'meilisearch_index_docs_count == 0'
-                severity: warning
-              - name: Meilisearch http response time
-                description: Meilisearch http response time is too high
-                query: "meilisearch_http_response_time_seconds > 0.5"
-                severity: warning
-
       - name: MongoDB
         exporters:
           - name: percona/mongodb_exporter
@@ -1162,6 +1147,21 @@ groups:
                 severity: warning
                 for: 5m                 
 
+      - name: Meilisearch
+        exporters:
+          - name: Embedded exporter
+            slug: embedded-exporter
+            doc_url: https://github.com/orgs/meilisearch/discussions/625
+            rules:                
+              - name: Meilisearch index is empty
+                description: Meilisearch instance is down
+                query: 'meilisearch_index_docs_count == 0'
+                severity: warning
+              - name: Meilisearch http response time
+                description: Meilisearch http response time is too high
+                query: "meilisearch_http_response_time_seconds > 0.5"
+                severity: warning
+
       - name: Cassandra
         exporters:
           - name: instaclustr/cassandra-exporter

From 58ade95b8bfaf20f875c74b8d7a6c509f70a77ab Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 2 Jul 2024 07:34:59 +0000
Subject: [PATCH 077/123] Publish

---
 dist/rules/meilisearch/embedded-exporter.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml
index a8824dd..8da2803 100644
--- a/dist/rules/meilisearch/embedded-exporter.yml
+++ b/dist/rules/meilisearch/embedded-exporter.yml
@@ -5,19 +5,19 @@ groups:
   rules:
 
     - alert: MeilisearchIndexIsEmpty
-      expr: meilisearch_index_docs_count == 0
-      for: 5m
+      expr: 'meilisearch_index_docs_count == 0'
+      for: 0m
       labels:
         severity: warning
       annotations:
-        summary: the index {{ $labels.Index }} is empty
-        description: "The index {{ $labels.Index }} is empty at the moment, and shouldnt be empty\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-        
-    - alert: MeilisearchHttpResponseTimeIsTooHigh
-      expr: rate(meilisearch_http_response_time_seconds_sum[5m]) / rate(meilisearch_http_response_time_seconds_count[5m]) > 0.5
-      for: 5m
+        summary: Meilisearch index is empty (instance {{ $labels.instance }})
+        description: "Meilisearch instance is down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MeilisearchHttpResponseTime
+      expr: 'meilisearch_http_response_time_seconds > 0.5'
+      for: 0m
       labels:
         severity: warning
       annotations:
-        summary: the meilisearch server http response time is too high
-        description: "The meilisearch server http response time is too high at the moment\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Meilisearch http response time (instance {{ $labels.instance }})
+        description: "Meilisearch http response time is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 2c764df932e61bfeb81c396ece3d93bd7a4cf3bc Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 18 Jul 2024 10:14:45 +0200
Subject: [PATCH 078/123] fix: Gemfile & Gemfile.lock to reduce vulnerabilities
 (#426)

The following vulnerabilities are fixed with an upgrade:
- https://snyk.io/vuln/SNYK-RUBY-REXML-7462086

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
---
 Gemfile      |   2 +-
 Gemfile.lock | 189 ++++++++++++++++++++++++---------------------------
 2 files changed, 90 insertions(+), 101 deletions(-)

diff --git a/Gemfile b/Gemfile
index 31ddf5d..eef87b6 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,3 @@
 source 'https://rubygems.org'
-gem 'github-pages', group: :jekyll_plugins
+gem 'github-pages', '>= 227', group: :jekyll_plugins
 gem 'webrick', '~> 1.3', '>= 1.3.1'
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index ca3c33c..c11fe91 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,66 +1,56 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (6.0.6.1)
+    activesupport (7.1.3.4)
+      base64
+      bigdecimal
       concurrent-ruby (~> 1.0, >= 1.0.2)
-      i18n (>= 0.7, < 2)
-      minitest (~> 5.1)
-      tzinfo (~> 1.1)
-      zeitwerk (~> 2.2, >= 2.2.2)
-    addressable (2.8.0)
-      public_suffix (>= 2.0.2, < 5.0)
+      connection_pool (>= 2.2.5)
+      drb
+      i18n (>= 1.6, < 2)
+      minitest (>= 5.1)
+      mutex_m
+      tzinfo (~> 2.0)
+    addressable (2.8.7)
+      public_suffix (>= 2.0.2, < 7.0)
+    base64 (0.2.0)
+    bigdecimal (3.1.8)
     coffee-script (2.4.1)
       coffee-script-source
       execjs
-    coffee-script-source (1.11.1)
+    coffee-script-source (1.12.2)
     colorator (1.1.0)
     commonmarker (0.23.10)
-    concurrent-ruby (1.2.0)
-    dnsruby (1.61.9)
-      simpleidn (~> 0.1)
+    concurrent-ruby (1.3.3)
+    connection_pool (2.4.1)
+    dnsruby (1.72.2)
+      simpleidn (~> 0.2.1)
+    drb (2.2.1)
     em-websocket (0.5.3)
       eventmachine (>= 0.12.9)
       http_parser.rb (~> 0)
-    ethon (0.15.0)
+    ethon (0.16.0)
       ffi (>= 1.15.0)
     eventmachine (1.2.7)
-    execjs (2.8.1)
-    faraday (1.10.0)
-      faraday-em_http (~> 1.0)
-      faraday-em_synchrony (~> 1.0)
-      faraday-excon (~> 1.1)
-      faraday-httpclient (~> 1.0)
-      faraday-multipart (~> 1.0)
-      faraday-net_http (~> 1.0)
-      faraday-net_http_persistent (~> 1.0)
-      faraday-patron (~> 1.0)
-      faraday-rack (~> 1.0)
-      faraday-retry (~> 1.0)
+    execjs (2.9.1)
+    faraday (2.8.1)
+      base64
+      faraday-net_http (>= 2.0, < 3.1)
       ruby2_keywords (>= 0.0.4)
-    faraday-em_http (1.0.0)
-    faraday-em_synchrony (1.0.0)
-    faraday-excon (1.1.0)
-    faraday-httpclient (1.0.1)
-    faraday-multipart (1.0.3)
-      multipart-post (>= 1.2, < 3)
-    faraday-net_http (1.0.1)
-    faraday-net_http_persistent (1.2.0)
-    faraday-patron (1.0.0)
-    faraday-rack (1.0.0)
-    faraday-retry (1.0.3)
-    ffi (1.15.5)
+    faraday-net_http (3.0.2)
+    ffi (1.16.3)
     forwardable-extended (2.6.0)
-    gemoji (3.0.1)
-    github-pages (226)
-      github-pages-health-check (= 1.17.9)
-      jekyll (= 3.9.2)
-      jekyll-avatar (= 0.7.0)
-      jekyll-coffeescript (= 1.1.1)
-      jekyll-commonmark-ghpages (= 0.2.0)
-      jekyll-default-layout (= 0.1.4)
-      jekyll-feed (= 0.15.1)
+    gemoji (4.1.0)
+    github-pages (231)
+      github-pages-health-check (= 1.18.2)
+      jekyll (= 3.9.5)
+      jekyll-avatar (= 0.8.0)
+      jekyll-coffeescript (= 1.2.2)
+      jekyll-commonmark-ghpages (= 0.4.0)
+      jekyll-default-layout (= 0.1.5)
+      jekyll-feed (= 0.17.0)
       jekyll-gist (= 1.5.0)
-      jekyll-github-metadata (= 2.13.0)
+      jekyll-github-metadata (= 2.16.1)
       jekyll-include-cache (= 0.2.1)
       jekyll-mentions (= 1.6.0)
       jekyll-optional-front-matter (= 0.3.2)
@@ -87,32 +77,32 @@ GEM
       jekyll-theme-tactile (= 0.2.0)
       jekyll-theme-time-machine (= 0.2.0)
       jekyll-titles-from-headings (= 0.5.3)
-      jemoji (= 0.12.0)
-      kramdown (= 2.3.2)
+      jemoji (= 0.13.0)
+      kramdown (= 2.4.0)
       kramdown-parser-gfm (= 1.1.0)
-      liquid (= 4.0.3)
+      liquid (= 4.0.4)
       mercenary (~> 0.3)
       minima (= 2.5.1)
-      nokogiri (>= 1.13.4, < 2.0)
-      rouge (= 3.26.0)
+      nokogiri (>= 1.13.6, < 2.0)
+      rouge (= 3.30.0)
       terminal-table (~> 1.4)
-    github-pages-health-check (1.17.9)
+    github-pages-health-check (1.18.2)
       addressable (~> 2.3)
       dnsruby (~> 1.60)
-      octokit (~> 4.0)
-      public_suffix (>= 3.0, < 5.0)
+      octokit (>= 4, < 8)
+      public_suffix (>= 3.0, < 6.0)
       typhoeus (~> 1.3)
-    html-pipeline (2.14.1)
+    html-pipeline (2.14.3)
       activesupport (>= 2)
       nokogiri (>= 1.4)
     http_parser.rb (0.8.0)
-    i18n (0.9.5)
+    i18n (1.14.5)
       concurrent-ruby (~> 1.0)
-    jekyll (3.9.2)
+    jekyll (3.9.5)
       addressable (~> 2.4)
       colorator (~> 1.0)
       em-websocket (~> 0.5)
-      i18n (~> 0.7)
+      i18n (>= 0.7, < 2)
       jekyll-sass-converter (~> 1.0)
       jekyll-watch (~> 2.0)
       kramdown (>= 1.17, < 3)
@@ -121,27 +111,27 @@ GEM
       pathutil (~> 0.9)
       rouge (>= 1.7, < 4)
       safe_yaml (~> 1.0)
-    jekyll-avatar (0.7.0)
+    jekyll-avatar (0.8.0)
       jekyll (>= 3.0, < 5.0)
-    jekyll-coffeescript (1.1.1)
+    jekyll-coffeescript (1.2.2)
       coffee-script (~> 2.2)
-      coffee-script-source (~> 1.11.1)
+      coffee-script-source (~> 1.12)
     jekyll-commonmark (1.4.0)
       commonmarker (~> 0.22)
-    jekyll-commonmark-ghpages (0.2.0)
-      commonmarker (~> 0.23.4)
+    jekyll-commonmark-ghpages (0.4.0)
+      commonmarker (~> 0.23.7)
       jekyll (~> 3.9.0)
       jekyll-commonmark (~> 1.4.0)
-      rouge (>= 2.0, < 4.0)
-    jekyll-default-layout (0.1.4)
-      jekyll (~> 3.0)
-    jekyll-feed (0.15.1)
+      rouge (>= 2.0, < 5.0)
+    jekyll-default-layout (0.1.5)
+      jekyll (>= 3.0, < 5.0)
+    jekyll-feed (0.17.0)
       jekyll (>= 3.7, < 5.0)
     jekyll-gist (1.5.0)
       octokit (~> 4.2)
-    jekyll-github-metadata (2.13.0)
+    jekyll-github-metadata (2.16.1)
       jekyll (>= 3.4, < 5.0)
-      octokit (~> 4.0, != 4.4.0)
+      octokit (>= 4, < 7, != 4.4.0)
     jekyll-include-cache (0.2.1)
       jekyll (>= 3.7, < 5.0)
     jekyll-mentions (1.6.0)
@@ -212,40 +202,44 @@ GEM
       jekyll (>= 3.3, < 5.0)
     jekyll-watch (2.2.1)
       listen (~> 3.0)
-    jemoji (0.12.0)
-      gemoji (~> 3.0)
+    jemoji (0.13.0)
+      gemoji (>= 3, < 5)
       html-pipeline (~> 2.2)
       jekyll (>= 3.0, < 5.0)
-    kramdown (2.3.2)
+    kramdown (2.4.0)
       rexml
     kramdown-parser-gfm (1.1.0)
       kramdown (~> 2.0)
-    liquid (4.0.3)
-    listen (3.7.1)
+    liquid (4.0.4)
+    listen (3.9.0)
       rb-fsevent (~> 0.10, >= 0.10.3)
       rb-inotify (~> 0.9, >= 0.9.10)
     mercenary (0.3.6)
+    mini_portile2 (2.8.7)
     minima (2.5.1)
       jekyll (>= 3.5, < 5.0)
       jekyll-feed (~> 0.9)
       jekyll-seo-tag (~> 2.1)
-    minitest (5.17.0)
-    multipart-post (2.1.1)
-    nokogiri (1.16.5-x86_64-linux)
+    minitest (5.24.1)
+    mutex_m (0.2.0)
+    nokogiri (1.15.6)
+      mini_portile2 (~> 2.8.2)
       racc (~> 1.4)
-    octokit (4.22.0)
-      faraday (>= 0.9)
-      sawyer (~> 0.8.0, >= 0.5.3)
+    nokogiri (1.15.6-x86_64-linux)
+      racc (~> 1.4)
+    octokit (4.25.1)
+      faraday (>= 1, < 3)
+      sawyer (~> 0.9)
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
-    public_suffix (4.0.7)
-    racc (1.7.3)
-    rb-fsevent (0.11.1)
-    rb-inotify (0.10.1)
+    public_suffix (5.1.1)
+    racc (1.8.0)
+    rb-fsevent (0.11.2)
+    rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rexml (3.2.8)
-      strscan (>= 3.0.9)
-    rouge (3.26.0)
+    rexml (3.3.2)
+      strscan
+    rouge (3.30.0)
     ruby2_keywords (0.0.5)
     rubyzip (2.3.2)
     safe_yaml (1.0.5)
@@ -254,32 +248,27 @@ GEM
     sass-listen (4.0.0)
       rb-fsevent (~> 0.9, >= 0.9.4)
       rb-inotify (~> 0.9, >= 0.9.7)
-    sawyer (0.8.2)
+    sawyer (0.9.2)
       addressable (>= 2.3.5)
-      faraday (> 0.8, < 2.0)
-    simpleidn (0.2.1)
-      unf (~> 0.1.4)
+      faraday (>= 0.17.3, < 3)
+    simpleidn (0.2.3)
     strscan (3.1.0)
     terminal-table (1.8.0)
       unicode-display_width (~> 1.1, >= 1.1.1)
-    thread_safe (0.3.6)
-    typhoeus (1.4.0)
+    typhoeus (1.4.1)
       ethon (>= 0.9.0)
-    tzinfo (1.2.11)
-      thread_safe (~> 0.1)
-    unf (0.1.4)
-      unf_ext
-    unf_ext (0.0.8.1)
+    tzinfo (2.0.6)
+      concurrent-ruby (~> 1.0)
     unicode-display_width (1.8.0)
     webrick (1.7.0)
-    zeitwerk (2.6.6)
 
 PLATFORMS
+  ruby
   x86_64-linux
   x86_64-linux-musl
 
 DEPENDENCIES
-  github-pages
+  github-pages (>= 227)
   webrick (~> 1.3, >= 1.3.1)
 
 BUNDLED WITH

From 225607cf7f72d075c759d52921203906a3a10fd6 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 30 Jul 2024 17:25:23 +0200
Subject: [PATCH 079/123] build(deps-dev): bump nokogiri from 1.15.6 to 1.16.5
 (#427)

Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.15.6 to 1.16.5.
- [Release notes](https://github.com/sparklemotion/nokogiri/releases)
- [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md)
- [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.15.6...v1.16.5)

---
updated-dependencies:
- dependency-name: nokogiri
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index c11fe91..f41c2f6 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -222,10 +222,10 @@ GEM
       jekyll-seo-tag (~> 2.1)
     minitest (5.24.1)
     mutex_m (0.2.0)
-    nokogiri (1.15.6)
+    nokogiri (1.16.5)
       mini_portile2 (~> 2.8.2)
       racc (~> 1.4)
-    nokogiri (1.15.6-x86_64-linux)
+    nokogiri (1.16.5-x86_64-linux)
       racc (~> 1.4)
     octokit (4.25.1)
       faraday (>= 1, < 3)
@@ -233,7 +233,7 @@ GEM
     pathutil (0.16.2)
       forwardable-extended (~> 2.6)
     public_suffix (5.1.1)
-    racc (1.8.0)
+    racc (1.8.1)
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)

From 61da73d5171e02f542be374d85bc0a2655857666 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 2 Aug 2024 14:14:26 +0200
Subject: [PATCH 080/123] build(deps-dev): bump rexml from 3.3.2 to 3.3.3
 (#428)

Bumps [rexml](https://github.com/ruby/rexml) from 3.3.2 to 3.3.3.
- [Release notes](https://github.com/ruby/rexml/releases)
- [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md)
- [Commits](https://github.com/ruby/rexml/compare/v3.3.2...v3.3.3)

---
updated-dependencies:
- dependency-name: rexml
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index f41c2f6..f2d1111 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -237,7 +237,7 @@ GEM
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rexml (3.3.2)
+    rexml (3.3.3)
       strscan
     rouge (3.30.0)
     ruby2_keywords (0.0.5)

From d1715de75150fb714a4bd6cd489935ccc7d6282b Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 20 Aug 2024 18:31:08 +0200
Subject: [PATCH 081/123] fix PostgresqlInvalidIndex rule

---
 _data/rules.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 0216beb..b9506d2 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -730,9 +730,11 @@ groups:
                   See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
               - name: Postgresql invalid index
                 description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
-                query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+                query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
                 severity: warning
                 for: 6h
+                comments: |
+                  See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
 
       - name: SQL Server
         exporters:

From 02687db33d657d387045a6a3e43fae793e6f2dfd Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Tue, 20 Aug 2024 16:32:36 +0000
Subject: [PATCH 082/123] Publish

---
 dist/rules/postgresql/postgres-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 0e1f473..2ab461f 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -185,7 +185,7 @@ groups:
         description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlInvalidIndex
-      expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+      expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
       for: 6h
       labels:
         severity: warning

From 8c0bdc2b24e9ba6b07e659e8260020788755b4c3 Mon Sep 17 00:00:00 2001
From: Somrat Dutta <38795369+somratdutta@users.noreply.github.com>
Date: Wed, 21 Aug 2024 00:07:03 +0530
Subject: [PATCH 083/123] feat: Add NATS and JetStream Prometheus alert rules
 (#430)

* feat: Add comprehensive NATS and JetStream Prometheus alert rules

- Added multiple Prometheus alert rules for monitoring NATS server and JetStream metrics.
- Included alerts for:
  - High connection count
  - High pending bytes
  - High subscriptions count
  - High routes count
  - High memory usage
  - Slow consumers
  - NATS server downtime
  - High CPU usage
  - High number of active connections
  - High JetStream store and memory usage
  - Subscription limits exceeded
  - High pending messages
  - Authentication timeouts
  - Errors in NATS (JetStream API errors)
  - JetStream consumers limit exceeded
  - Exceeding max payload size
  - Leaf node connection issues
  - Ping operations limit exceeded
  - Write deadline exceeded
- Ensured consistency between `exporter.yml` and `rules.yml` files.
- Improved overall NATS and JetStream monitoring to prevent performance degradation and ensure system reliability.

This commit enhances the visibility of NATS and JetStream operations by providing key metrics to alert on potential issues and optimize system performance.

* Update rules.yml

* - minor changes, rollback rules.yml
- address comment changes
- revert to old rules.yml as they are generated

* - minor changes, rollback rules.yml
- address comment changes
- revert to old rules.yml as they are generated

* fix indentation

---------

Co-authored-by: somratdutta <duttasomratand.com>
Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
Co-authored-by: somrat.dutta <somrat.dutta@nutanix.com>
---
 _data/rules.yml                   | 83 ++++++++++++++++++++++++++++++-
 dist/rules/nats/nats-exporter.yml |  2 +-
 2 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index b9506d2..6f5d04d 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1534,9 +1534,90 @@ groups:
                 for: 3m
               - name: Nats high routes count
                 description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
-                query: "gnatsd_routez_num_routes > 10"
+                query: "gnatsd_varz_routes > 10"
                 severity: warning
                 for: 3m
+              - name: Nats high memory usage
+                description: NATS server memory usage is above 200MB for {{ $labels.instance }}
+                query: "gnatsd_varz_mem > 200 * 1024 * 1024"
+                severity: warning
+                for: 5m
+              - name: Nats slow consumers
+                description: There are slow consumers in NATS for {{ $labels.instance }}
+                query: "gnatsd_varz_slow_consumers > 0"
+                severity: critical
+                for: 3m
+              - name: Nats server down
+                description: NATS server has been down for more than 5 minutes
+                query: "absent(up{job="nats"})"
+                severity: critical
+                for: 5m
+              - name: Nats high CPU usage
+                description: NATS server is using more than 80% CPU for the last 5 minutes
+                query: "rate(gnatsd_varz_cpu[5m]) > 0.8"
+                severity: warning
+                for: 5m
+              - name: Nats high number of connections
+                description: NATS server has more than 1000 active connections
+                query: "gnatsd_connz_num_connections > 1000"
+                severity: warning
+                for: 5m
+              - name: Nats high JetStream store usage
+                description: JetStream store usage is over 80%
+                query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
+                severity: warning
+                for: 5m
+              - name: Nats high JetStream memory usage
+                description: JetStream memory usage is over 80%
+                query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
+                severity: warning
+                for: 5m
+              - name: Nats high number of subscriptions
+                description: NATS server has more than 1000 active subscriptions
+                query: "gnatsd_connz_subscriptions > 1000"
+                severity: warning
+                for: 5m
+              - name: Nats high pending bytes
+                description: NATS server has more than 100,000 pending bytes
+                query: "gnatsd_connz_pending_bytes > 100000"
+                severity: warning
+                for: 5m
+              - name: Nats too many errors
+                description: NATS server has encountered errors in the last 5 minutes
+                query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
+                severity: warning
+                for: 5m
+              - name: Nats JetStream consumers exceeded
+                description: JetStream has more than 100 active consumers
+                query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
+                severity: warning
+                for: 5m
+              - name: Nats frequent authentication timeouts
+                description: There have been more than 5 authentication timeouts in the last 5 minutes
+                query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
+                severity: warning
+                for: 5m
+              - name: Nats max payload size exceeded
+                description: The max payload size allowed by NATS has been exceeded (1MB)
+                query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
+                severity: critical
+                for: 5m
+              - name: Nats leaf node connection issue
+                description: No leaf node connections have been established in the last 5 minutes
+                query: "increase(gnatsd_varz_leafnodes[5m]) == 0"
+                severity: critical
+                for: 5m
+              - name: Nats max ping operations exceeded
+                description: The maximum number of ping operations in NATS has exceeded 50
+                query: "gnatsd_varz_ping_max > 50"
+                severity: warning
+                for: 5m
+              - name: Nats write deadline exceeded
+                description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
+                query: "gnatsd_varz_write_deadline > 10"
+                severity: critical
+                for: 5m
+
 
       - name: Solr
         exporters:
diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml
index 13eda2b..a9a74fa 100644
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@@ -38,4 +38,4 @@ groups:
         severity: warning
       annotations:
         summary: Nats high routes count (instance {{ $labels.instance }})
-        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
\ No newline at end of file

From 3bf8d6d824e5d61221ce4e380d5979532f28f68a Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 24 Aug 2024 10:42:21 +0200
Subject: [PATCH 084/123] fix: Gemfile to reduce vulnerabilities (#432)

The following vulnerabilities are fixed with an upgrade:
- https://snyk.io/vuln/SNYK-RUBY-REXML-7814166

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
---
 Gemfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile b/Gemfile
index eef87b6..c958185 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,3 @@
 source 'https://rubygems.org'
-gem 'github-pages', '>= 227', group: :jekyll_plugins
+gem 'github-pages', '>= 232', group: :jekyll_plugins
 gem 'webrick', '~> 1.3', '>= 1.3.1'
\ No newline at end of file

From 995ab4d27a5cdaa8045f1103f15a7161c4b245b3 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Wed, 28 Aug 2024 08:46:41 +0200
Subject: [PATCH 085/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 6f5d04d..f5ac4bf 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -267,7 +267,7 @@ groups:
                 for: 5m
               - name: Host node overtemperature alarm
                 description: "Physical node temperature alarm triggered"
-                query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: critical
               - name: Host RAID array got inactive
                 description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically."

From f08e8df514173515f14a09b2f8a805010316c4f0 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Wed, 28 Aug 2024 08:48:42 +0200
Subject: [PATCH 086/123] oops

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index f5ac4bf..9b94c17 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1549,7 +1549,7 @@ groups:
                 for: 3m
               - name: Nats server down
                 description: NATS server has been down for more than 5 minutes
-                query: "absent(up{job="nats"})"
+                query: 'absent(up{job="nats"})'
                 severity: critical
                 for: 5m
               - name: Nats high CPU usage

From 4aa45dee059a05bb5e8268b506197699296221aa Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Wed, 28 Aug 2024 06:49:52 +0000
Subject: [PATCH 087/123] Publish

---
 .../rules/host-and-hardware/node-exporter.yml |   2 +-
 dist/rules/nats/nats-exporter.yml             | 148 +++++++++++++++++-
 2 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 0d80c16..6a465d9 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -215,7 +215,7 @@ groups:
         description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostNodeOvertemperatureAlarm
-      expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
       for: 0m
       labels:
         severity: critical
diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml
index a9a74fa..7648762 100644
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@@ -32,10 +32,154 @@ groups:
         description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: NatsHighRoutesCount
-      expr: 'gnatsd_routez_num_routes > 10'
+      expr: 'gnatsd_varz_routes > 10'
       for: 3m
       labels:
         severity: warning
       annotations:
         summary: Nats high routes count (instance {{ $labels.instance }})
-        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
\ No newline at end of file
+        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighMemoryUsage
+      expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high memory usage (instance {{ $labels.instance }})
+        description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsSlowConsumers
+      expr: 'gnatsd_varz_slow_consumers > 0'
+      for: 3m
+      labels:
+        severity: critical
+      annotations:
+        summary: Nats slow consumers (instance {{ $labels.instance }})
+        description: "There are slow consumers in NATS for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsServerDown
+      expr: 'absent(up{job="nats"})'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Nats server down (instance {{ $labels.instance }})
+        description: "NATS server has been down for more than 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighCpuUsage
+      expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high CPU usage (instance {{ $labels.instance }})
+        description: "NATS server is using more than 80% CPU for the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighNumberOfConnections
+      expr: 'gnatsd_connz_num_connections > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high number of connections (instance {{ $labels.instance }})
+        description: "NATS server has more than 1000 active connections\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighJetstreamStoreUsage
+      expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high JetStream store usage (instance {{ $labels.instance }})
+        description: "JetStream store usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighJetstreamMemoryUsage
+      expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
+        description: "JetStream memory usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighNumberOfSubscriptions
+      expr: 'gnatsd_connz_subscriptions > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high number of subscriptions (instance {{ $labels.instance }})
+        description: "NATS server has more than 1000 active subscriptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighPendingBytes
+      expr: 'gnatsd_connz_pending_bytes > 100000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high pending bytes (instance {{ $labels.instance }})
+        description: "NATS server has more than 100,000 pending bytes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsTooManyErrors
+      expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats too many errors (instance {{ $labels.instance }})
+        description: "NATS server has encountered errors in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsJetstreamConsumersExceeded
+      expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
+        description: "JetStream has more than 100 active consumers\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsFrequentAuthenticationTimeouts
+      expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
+        description: "There have been more than 5 authentication timeouts in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsMaxPayloadSizeExceeded
+      expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Nats max payload size exceeded (instance {{ $labels.instance }})
+        description: "The max payload size allowed by NATS has been exceeded (1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsLeafNodeConnectionIssue
+      expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Nats leaf node connection issue (instance {{ $labels.instance }})
+        description: "No leaf node connections have been established in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsMaxPingOperationsExceeded
+      expr: 'gnatsd_varz_ping_max > 50'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
+        description: "The maximum number of ping operations in NATS has exceeded 50\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsWriteDeadlineExceeded
+      expr: 'gnatsd_varz_write_deadline > 10'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Nats write deadline exceeded (instance {{ $labels.instance }})
+        description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 65a5f586cb8f72c44310acf357f1eaae476cd591 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 9 Sep 2024 20:09:20 +0200
Subject: [PATCH 088/123] build(deps-dev): bump rexml from 3.3.3 to 3.3.6
 (#431)

Bumps [rexml](https://github.com/ruby/rexml) from 3.3.3 to 3.3.6.
- [Release notes](https://github.com/ruby/rexml/releases)
- [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md)
- [Commits](https://github.com/ruby/rexml/compare/v3.3.3...v3.3.6)

---
updated-dependencies:
- dependency-name: rexml
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index f2d1111..71b8bda 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -237,7 +237,7 @@ GEM
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rexml (3.3.3)
+    rexml (3.3.6)
       strscan
     rouge (3.30.0)
     ruby2_keywords (0.0.5)

From d6d6ae4ef843bfc63d021008772a0d55c02beb51 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 26 Sep 2024 11:31:21 +0200
Subject: [PATCH 089/123] fix: Gemfile to reduce vulnerabilities (#434)

The following vulnerabilities are fixed with an upgrade:
- https://snyk.io/vuln/SNYK-RUBY-WEBRICK-8068535

Co-authored-by: snyk-bot <snyk-bot@snyk.io>
---
 Gemfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile b/Gemfile
index c958185..1ff80aa 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,3 @@
 source 'https://rubygems.org'
 gem 'github-pages', '>= 232', group: :jekyll_plugins
-gem 'webrick', '~> 1.3', '>= 1.3.1'
\ No newline at end of file
+gem 'webrick', '~> 1.8', '>= 1.8.2'
\ No newline at end of file

From 35596c866f129e3134f7ac705e90f50002dae073 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 27 Sep 2024 22:24:21 +0200
Subject: [PATCH 090/123] build(deps): bump webrick from 1.7.0 to 1.8.2 (#435)

Bumps [webrick](https://github.com/ruby/webrick) from 1.7.0 to 1.8.2.
- [Release notes](https://github.com/ruby/webrick/releases)
- [Commits](https://github.com/ruby/webrick/compare/v1.7.0...v1.8.2)

---
updated-dependencies:
- dependency-name: webrick
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile      |  2 +-
 Gemfile.lock | 72 ++++++++++++++++++++++++++++++----------------------
 2 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/Gemfile b/Gemfile
index 1ff80aa..cddfa60 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,3 @@
 source 'https://rubygems.org'
 gem 'github-pages', '>= 232', group: :jekyll_plugins
-gem 'webrick', '~> 1.8', '>= 1.8.2'
\ No newline at end of file
+gem 'webrick', '~> 1.8'
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index 71b8bda..1afa3ed 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,16 +1,17 @@
 GEM
   remote: https://rubygems.org/
   specs:
-    activesupport (7.1.3.4)
+    activesupport (7.2.1)
       base64
       bigdecimal
-      concurrent-ruby (~> 1.0, >= 1.0.2)
+      concurrent-ruby (~> 1.0, >= 1.3.1)
       connection_pool (>= 2.2.5)
       drb
       i18n (>= 1.6, < 2)
+      logger (>= 1.4.2)
       minitest (>= 5.1)
-      mutex_m
-      tzinfo (~> 2.0)
+      securerandom (>= 0.3)
+      tzinfo (~> 2.0, >= 2.0.5)
     addressable (2.8.7)
       public_suffix (>= 2.0.2, < 7.0)
     base64 (0.2.0)
@@ -21,8 +22,9 @@ GEM
     coffee-script-source (1.12.2)
     colorator (1.1.0)
     commonmarker (0.23.10)
-    concurrent-ruby (1.3.3)
+    concurrent-ruby (1.3.4)
     connection_pool (2.4.1)
+    csv (3.3.0)
     dnsruby (1.72.2)
       simpleidn (~> 0.2.1)
     drb (2.2.1)
@@ -33,20 +35,23 @@ GEM
       ffi (>= 1.15.0)
     eventmachine (1.2.7)
     execjs (2.9.1)
-    faraday (2.8.1)
-      base64
-      faraday-net_http (>= 2.0, < 3.1)
-      ruby2_keywords (>= 0.0.4)
-    faraday-net_http (3.0.2)
-    ffi (1.16.3)
+    faraday (2.12.0)
+      faraday-net_http (>= 2.0, < 3.4)
+      json
+      logger
+    faraday-net_http (3.3.0)
+      net-http
+    ffi (1.17.0)
+    ffi (1.17.0-x86_64-linux-gnu)
+    ffi (1.17.0-x86_64-linux-musl)
     forwardable-extended (2.6.0)
     gemoji (4.1.0)
-    github-pages (231)
+    github-pages (232)
       github-pages-health-check (= 1.18.2)
-      jekyll (= 3.9.5)
+      jekyll (= 3.10.0)
       jekyll-avatar (= 0.8.0)
       jekyll-coffeescript (= 1.2.2)
-      jekyll-commonmark-ghpages (= 0.4.0)
+      jekyll-commonmark-ghpages (= 0.5.1)
       jekyll-default-layout (= 0.1.5)
       jekyll-feed (= 0.17.0)
       jekyll-gist (= 1.5.0)
@@ -83,9 +88,10 @@ GEM
       liquid (= 4.0.4)
       mercenary (~> 0.3)
       minima (= 2.5.1)
-      nokogiri (>= 1.13.6, < 2.0)
+      nokogiri (>= 1.16.2, < 2.0)
       rouge (= 3.30.0)
       terminal-table (~> 1.4)
+      webrick (~> 1.8)
     github-pages-health-check (1.18.2)
       addressable (~> 2.3)
       dnsruby (~> 1.60)
@@ -96,11 +102,12 @@ GEM
       activesupport (>= 2)
       nokogiri (>= 1.4)
     http_parser.rb (0.8.0)
-    i18n (1.14.5)
+    i18n (1.14.6)
       concurrent-ruby (~> 1.0)
-    jekyll (3.9.5)
+    jekyll (3.10.0)
       addressable (~> 2.4)
       colorator (~> 1.0)
+      csv (~> 3.0)
       em-websocket (~> 0.5)
       i18n (>= 0.7, < 2)
       jekyll-sass-converter (~> 1.0)
@@ -111,6 +118,7 @@ GEM
       pathutil (~> 0.9)
       rouge (>= 1.7, < 4)
       safe_yaml (~> 1.0)
+      webrick (>= 1.0)
     jekyll-avatar (0.8.0)
       jekyll (>= 3.0, < 5.0)
     jekyll-coffeescript (1.2.2)
@@ -118,9 +126,9 @@ GEM
       coffee-script-source (~> 1.12)
     jekyll-commonmark (1.4.0)
       commonmarker (~> 0.22)
-    jekyll-commonmark-ghpages (0.4.0)
-      commonmarker (~> 0.23.7)
-      jekyll (~> 3.9.0)
+    jekyll-commonmark-ghpages (0.5.1)
+      commonmarker (>= 0.23.7, < 1.1.0)
+      jekyll (>= 3.9, < 4.0)
       jekyll-commonmark (~> 1.4.0)
       rouge (>= 2.0, < 5.0)
     jekyll-default-layout (0.1.5)
@@ -206,6 +214,7 @@ GEM
       gemoji (>= 3, < 5)
       html-pipeline (~> 2.2)
       jekyll (>= 3.0, < 5.0)
+    json (2.7.2)
     kramdown (2.4.0)
       rexml
     kramdown-parser-gfm (1.1.0)
@@ -214,18 +223,20 @@ GEM
     listen (3.9.0)
       rb-fsevent (~> 0.10, >= 0.10.3)
       rb-inotify (~> 0.9, >= 0.9.10)
+    logger (1.6.1)
     mercenary (0.3.6)
     mini_portile2 (2.8.7)
     minima (2.5.1)
       jekyll (>= 3.5, < 5.0)
       jekyll-feed (~> 0.9)
       jekyll-seo-tag (~> 2.1)
-    minitest (5.24.1)
-    mutex_m (0.2.0)
-    nokogiri (1.16.5)
+    minitest (5.25.1)
+    net-http (0.4.1)
+      uri
+    nokogiri (1.16.7)
       mini_portile2 (~> 2.8.2)
       racc (~> 1.4)
-    nokogiri (1.16.5-x86_64-linux)
+    nokogiri (1.16.7-x86_64-linux)
       racc (~> 1.4)
     octokit (4.25.1)
       faraday (>= 1, < 3)
@@ -237,10 +248,8 @@ GEM
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rexml (3.3.6)
-      strscan
+    rexml (3.3.7)
     rouge (3.30.0)
-    ruby2_keywords (0.0.5)
     rubyzip (2.3.2)
     safe_yaml (1.0.5)
     sass (3.7.4)
@@ -251,8 +260,8 @@ GEM
     sawyer (0.9.2)
       addressable (>= 2.3.5)
       faraday (>= 0.17.3, < 3)
+    securerandom (0.3.1)
     simpleidn (0.2.3)
-    strscan (3.1.0)
     terminal-table (1.8.0)
       unicode-display_width (~> 1.1, >= 1.1.1)
     typhoeus (1.4.1)
@@ -260,7 +269,8 @@ GEM
     tzinfo (2.0.6)
       concurrent-ruby (~> 1.0)
     unicode-display_width (1.8.0)
-    webrick (1.7.0)
+    uri (0.13.1)
+    webrick (1.8.2)
 
 PLATFORMS
   ruby
@@ -268,8 +278,8 @@ PLATFORMS
   x86_64-linux-musl
 
 DEPENDENCIES
-  github-pages (>= 227)
-  webrick (~> 1.3, >= 1.3.1)
+  github-pages (>= 232)
+  webrick (~> 1.8)
 
 BUNDLED WITH
    2.3.13

From cd5b39a1f02123cb69476b868fee22f393f3f346 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 5 Oct 2024 18:06:22 +0200
Subject: [PATCH 091/123] Create FUNDING.json

---
 FUNDING.json | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 FUNDING.json

diff --git a/FUNDING.json b/FUNDING.json
new file mode 100644
index 0000000..cf7bea6
--- /dev/null
+++ b/FUNDING.json
@@ -0,0 +1,7 @@
+{
+  "drips": {
+    "ethereum": {
+      "ownedBy": "0xc31e1c24253da5a0c7ed4955347588c626c22292"
+    }
+  }
+}

From 640f06588d86e9f0818e3d8e90fed90a4b5cd6e2 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 5 Oct 2024 18:21:35 +0200
Subject: [PATCH 092/123] Delete FUNDING.json

---
 FUNDING.json | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 FUNDING.json

diff --git a/FUNDING.json b/FUNDING.json
deleted file mode 100644
index cf7bea6..0000000
--- a/FUNDING.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "drips": {
-    "ethereum": {
-      "ownedBy": "0xc31e1c24253da5a0c7ed4955347588c626c22292"
-    }
-  }
-}

From 7313acce364a5ce013cc020d71fdc5178b7729be Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sat, 5 Oct 2024 18:57:43 +0200
Subject: [PATCH 093/123] Create FUNDING.json

---
 FUNDING.json | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 FUNDING.json

diff --git a/FUNDING.json b/FUNDING.json
new file mode 100644
index 0000000..c4eccbf
--- /dev/null
+++ b/FUNDING.json
@@ -0,0 +1,7 @@
+{
+  "drips": {
+    "ethereum": {
+      "ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1"
+    }
+  }
+}

From c41fda1d924f66315773df90b9e12db172169e81 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 6 Oct 2024 17:31:23 +0200
Subject: [PATCH 094/123] Update alertmanager.md

---
 alertmanager.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/alertmanager.md b/alertmanager.md
index 6b07089..d350945 100644
--- a/alertmanager.md
+++ b/alertmanager.md
@@ -138,4 +138,4 @@ If the notification takes too much time to be triggered, check the following del
 Also read:
 - [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
 - [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
-
+- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)

From f9e683896f07e44ffd3ea15ba3b290a932c30a35 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 28 Oct 2024 20:17:58 +0100
Subject: [PATCH 095/123] build(deps-dev): bump rexml from 3.3.7 to 3.3.9
 (#438)

Bumps [rexml](https://github.com/ruby/rexml) from 3.3.7 to 3.3.9.
- [Release notes](https://github.com/ruby/rexml/releases)
- [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md)
- [Commits](https://github.com/ruby/rexml/compare/v3.3.7...v3.3.9)

---
updated-dependencies:
- dependency-name: rexml
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 Gemfile.lock | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Gemfile.lock b/Gemfile.lock
index 1afa3ed..305a897 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -248,7 +248,7 @@ GEM
     rb-fsevent (0.11.2)
     rb-inotify (0.11.1)
       ffi (~> 1.0)
-    rexml (3.3.7)
+    rexml (3.3.9)
     rouge (3.30.0)
     rubyzip (2.3.2)
     safe_yaml (1.0.5)

From bb75cb2c68e0baa634df262b93129b9731a14c94 Mon Sep 17 00:00:00 2001
From: sipr-invivo <160140834+sipr-invivo@users.noreply.github.com>
Date: Mon, 28 Oct 2024 22:24:10 +0100
Subject: [PATCH 096/123] feat: Add rule to Kubernetes Job not starting (#436)

---
 _data/rules.yml | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 9b94c17..f05d289 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1132,12 +1132,12 @@ groups:
                 description: "The indexing latency on Elasticsearch cluster is higher than the threshold."
                 query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005"
                 severity: warning
-                for: 10m       
+                for: 10m
               - name: Elasticsearch High Indexing Rate
                 description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
                 query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000"
                 severity: warning
-                for: 5m     
+                for: 5m
               - name: Elasticsearch High Query Rate
                 description: "The query rate on Elasticsearch cluster is higher than the threshold."
                 query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
@@ -1147,14 +1147,14 @@ groups:
                 description: "The query latency on Elasticsearch cluster is higher than the threshold."
                 query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1"
                 severity: warning
-                for: 5m                 
+                for: 5m
 
       - name: Meilisearch
         exporters:
           - name: Embedded exporter
             slug: embedded-exporter
             doc_url: https://github.com/orgs/meilisearch/discussions/625
-            rules:                
+            rules:
               - name: Meilisearch index is empty
                 description: Meilisearch instance is down
                 query: 'meilisearch_index_docs_count == 0'
@@ -2044,6 +2044,11 @@ groups:
                 description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
                 query: "kube_job_status_failed > 0"
                 severity: warning
+              - name: Kubernetes Job not starting
+                summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+                description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes"
+                query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600"
+                severity: warning
               - name: Kubernetes CronJob suspended
                 summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
                 description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"

From 14949721ba376f9c1c668bbe10eaeef6f86b0671 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 28 Oct 2024 21:25:18 +0000
Subject: [PATCH 097/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 2db1d64..efb914f 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -67,6 +67,15 @@ groups:
         summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
         description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    - alert: KubernetesJobNotStarting
+      expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
     - alert: KubernetesCronjobSuspended
       expr: 'kube_cronjob_spec_suspend != 0'
       for: 0m

From 353ef1ed95ef03a93087feb2162356a392f53cc6 Mon Sep 17 00:00:00 2001
From: Martin Anderson <martins.andersons@hotmail.com>
Date: Sat, 30 Nov 2024 11:29:57 +0200
Subject: [PATCH 098/123] RabbitMQ: add too many ready messages alert (#441)

* RabbitMQ: add too many ready messages alert

* Add RabbitMQ ready messages alert rule

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml                           | 5 +++++
 dist/rules/rabbitmq/rabbitmq-exporter.yml | 9 +++++++++
 2 files changed, 14 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index f05d289..aa0ed38 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -969,6 +969,11 @@ groups:
                 query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90"
                 severity: warning
                 for: 2m
+              - name: RabbitMQ too many ready messages
+                description: RabbitMQ too many ready messages on {{ $labels.instace }}
+                query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
+                severity: warning
+                for: 1m
               - name: RabbitMQ too many unack messages
                 description: Too many unacknowledged messages
                 query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml
index be95359..c699128 100644
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@@ -49,6 +49,15 @@ groups:
         summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
         description: "A node use more than 90% of file descriptors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
+    - alert: RabbitmqTooManyReadyMessages
+      expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
+      for: 1m
+      labels:
+        severity: warning
+      annotations:
+        summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
+        description: "Too many ready messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
     - alert: RabbitmqTooManyUnackMessages
       expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
       for: 1m

From 8a220b1b8af48f5405e2bbc28de14d93128efca7 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sat, 30 Nov 2024 09:31:05 +0000
Subject: [PATCH 099/123] Publish

---
 dist/rules/rabbitmq/rabbitmq-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml
index c699128..10823d2 100644
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@@ -56,7 +56,7 @@ groups:
         severity: warning
       annotations:
         summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
-        description: "Too many ready messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: RabbitmqTooManyUnackMessages
       expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'

From 8c3d06502fb26e7d0135a65d55130afa8e905ea9 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Thu, 5 Dec 2024 23:37:28 +0100
Subject: [PATCH 100/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index aa0ed38..5a274cc 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2036,7 +2036,7 @@ groups:
                 for: 2m
               - name: Kubernetes Node out of pod capacity
                 description: "Node {{ $labels.node }} is out of pod capacity"
-                query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+                query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
                 severity: warning
                 for: 2m
               - name: Kubernetes Container oom killer

From 4e38ae2087b03672e2c63439341ffcedb63ce90e Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Thu, 5 Dec 2024 22:38:38 +0000
Subject: [PATCH 101/123] Publish

---
 dist/rules/kubernetes/kubestate-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index efb914f..7e32694 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -41,7 +41,7 @@ groups:
         description: "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: KubernetesNodeOutOfPodCapacity
-      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
       for: 2m
       labels:
         severity: warning

From fff8a80ae5b19109bcdb6331e5304cf21fbba269 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 8 Dec 2024 21:24:45 +0100
Subject: [PATCH 102/123] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9188322..fdb5643 100644
--- a/README.md
+++ b/README.md
@@ -102,6 +102,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
 - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
 - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
+- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
 - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
 - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
 

From a8d7c43b3052e08b8a365f831bc74f9e87f0a824 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 8 Dec 2024 21:28:07 +0100
Subject: [PATCH 103/123] Update rules.yml

---
 _data/rules.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 5a274cc..abf9beb 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2982,6 +2982,15 @@ groups:
                 severity: critical
                 for: 5m
 
+      - name: Grafana Alloy
+        exporters:
+          - slug: embedded-exporter
+            rules:
+              - name: Grafana Alloy service down
+                description: Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running. 
+                query: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m)  '
+                severity: critical
+
       - name: Jenkins
         exporters:
           - name: Metric plugin

From c5203e94d009b50d4bb616ba7a6775a2bb9833a7 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 8 Dec 2024 20:29:15 +0000
Subject: [PATCH 104/123] Publish

---
 dist/rules/grafana-alloy/embedded-exporter.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 dist/rules/grafana-alloy/embedded-exporter.yml

diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml
new file mode 100644
index 0000000..d86c8a4
--- /dev/null
+++ b/dist/rules/grafana-alloy/embedded-exporter.yml
@@ -0,0 +1,14 @@
+groups:
+
+- name: EmbeddedExporter
+
+  rules:
+
+    - alert: GrafanaAlloyServiceDown
+      expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m)  '
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Grafana Alloy service down (instance {{ $labels.instance }})
+        description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 52d4a8c7449edfce74e7d495ecb9a070c58f9d31 Mon Sep 17 00:00:00 2001
From: dxrayz <105016816+dxrayz@users.noreply.github.com>
Date: Mon, 16 Dec 2024 13:16:05 +0200
Subject: [PATCH 105/123] Update postgres-exporter.yml (#444)

Modify PostgresqlConfigurationChanged to prevent the error "many-to-many matching not allowed: matching labels must be unique on one side" when you have multiple instances of postgres
---
 dist/rules/postgresql/postgres-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 2ab461f..36070b3 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -140,7 +140,7 @@ groups:
         description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlConfigurationChanged
-      expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+      expr: '{__name__=~"pg_settings_.*"} != ON(__name__,instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
       for: 0m
       labels:
         severity: info

From 4533f23b79bc3cb8b608eddec3001ac949408aee Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:17:17 +0000
Subject: [PATCH 106/123] Publish

---
 dist/rules/postgresql/postgres-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 36070b3..2ab461f 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -140,7 +140,7 @@ groups:
         description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlConfigurationChanged
-      expr: '{__name__=~"pg_settings_.*"} != ON(__name__,instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+      expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
       for: 0m
       labels:
         severity: info

From 84a3b517a8338407b0c6dc7f7890f4aaa4580901 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Dec 2024 12:17:26 +0100
Subject: [PATCH 107/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index abf9beb..6cb0fc8 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -703,7 +703,7 @@ groups:
                 for: 2m
               - name: Postgresql configuration changed
                 description: Postgres Database configuration change has occurred
-                query: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+                query: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
                 severity: info
               - name: Postgresql SSL compression active
                 description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.

From bdcc67c04e204d8abd88efe41085a7f68ea0ee39 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 16 Dec 2024 12:17:59 +0100
Subject: [PATCH 108/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 6cb0fc8..fa57247 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -545,7 +545,7 @@ groups:
                 for: 5m
               - name: Netdata high memory usage
                 description: Netdata high memory usage (> 80%)
-                query: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
+                query: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
                 severity: warning
                 for: 5m
               - name: Netdata low disk space

From 53a369769d98837632cfc515d7caf53f04ebcd6b Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Mon, 16 Dec 2024 11:19:08 +0000
Subject: [PATCH 109/123] Publish

---
 dist/rules/netdata/embedded-exporter.yml    | 2 +-
 dist/rules/postgresql/postgres-exporter.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml
index 7d21766..8c57745 100644
--- a/dist/rules/netdata/embedded-exporter.yml
+++ b/dist/rules/netdata/embedded-exporter.yml
@@ -23,7 +23,7 @@ groups:
         description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: NetdataHighMemoryUsage
-      expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
+      expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
       for: 5m
       labels:
         severity: warning
diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 2ab461f..96ae5ea 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -140,7 +140,7 @@ groups:
         description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlConfigurationChanged
-      expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+      expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
       for: 0m
       labels:
         severity: info

From cbb2337438e8ff75f67e5bde59b8872b3fe31f6a Mon Sep 17 00:00:00 2001
From: sunlei <guizaicn@gmail.com>
Date: Mon, 13 Jan 2025 05:01:21 +0800
Subject: [PATCH 110/123] fix: formatting errors (#448)

* fix: formatting errors

* Update query format in rules.yml

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml                                | 5 +----
 dist/rules/host-and-hardware/node-exporter.yml | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index fa57247..eab61c2 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -242,10 +242,7 @@ groups:
                 for: 5m
               - name: Host context switching high
                 description: Context switching is growing on the node (twice the daily average during the last 15m)
-                query: |
-                  (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
-                  /
-                  (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
                 severity: warning
                 comments: |
                   x2 context switches is an arbitrary number.
diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 6a465d9..a6adff1 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -176,10 +176,7 @@ groups:
         description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostContextSwitchingHigh
-      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
-/
-(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
-'
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
       for: 0m
       labels:
         severity: warning

From d916b7c6aba888a64535cb6d051070aeb9cd1a8e Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 28 Jan 2025 05:58:49 +0100
Subject: [PATCH 111/123] Fix from #405

---
 _data/rules.yml | 179 +++++++++++++++++++++---------------------------
 1 file changed, 79 insertions(+), 100 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index eab61c2..88e0785 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -27,7 +27,7 @@ groups:
                 severity: critical
               - name: Prometheus target missing with warmup time
                 description: Allow a job time to start up (10 minutes) before alerting that it's down.
-                query: "sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))"
+                query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
                 severity: critical
               - name: Prometheus configuration reload failure
                 description: Prometheus configuration reload error
@@ -137,53 +137,43 @@ groups:
             rules:
               - name: Host out of memory
                 description: Node memory is filling up (< 10% left)
-                query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
                 severity: warning
                 for: 2m
               - name: Host memory under memory pressure
-                description: The node is under heavy memory pressure. High rate of major page faults
-                query: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: The node is under heavy memory pressure. High rate of loading memory pages from disk.
+                query: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
                 severity: warning
-                for: 2m
               - name: Host Memory is underutilized
-                description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
-                query: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
+                query: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
                 severity: info
-                for: 1w
                 comments: |
                   You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
               - name: Host unusual network throughput in
-                description: Host network interfaces are probably receiving too much data (> 100 MB/s)
-                query: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: Host receive bandwidth is high (>80%).
+                query: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
                 severity: warning
-                for: 5m
               - name: Host unusual network throughput out
-                description: Host network interfaces are probably sending too much data (> 100 MB/s)
-                query: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: Host transmit bandwidth is high (>80%)
+                query: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
                 severity: warning
-                for: 5m
               - name: Host unusual disk read rate
-                description: Disk is probably reading too much data (> 50 MB/s)
-                query: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: Disk is too busy (IO wait > 80%)
+                query: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
                 severity: warning
-                for: 5m
-              - name: Host unusual disk write rate
-                description: Disk is probably writing too much data (> 50 MB/s)
-                query: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-                severity: warning
-                for: 2m
               - name: Host out of disk space
                 description: Disk is almost full (< 10% left)
-                query: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-                severity: warning
+                query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
+                severity: critical
                 comments: |
                   Please add ignored mountpoints in node_exporter parameters like
                   "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
                   Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
                 for: 2m
-              - name: Host disk will fill in 24 hours
-                description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
-                query: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host disk may fill in 24 hours
+                description: Filesystem will likely run out of space within the next 24 hours.
+                query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
                 severity: warning
                 comments: |
                   Please add ignored mountpoints in node_exporter parameters like
@@ -192,52 +182,52 @@ groups:
                 for: 2m
               - name: Host out of inodes
                 description: Disk is almost running out of available inodes (< 10% left)
-                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-                severity: warning
-                for: 2m
-              - name: Host filesystem device error
-                description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem"
-                query: "node_filesystem_device_error == 1"
+                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
                 severity: critical
                 for: 2m
-              - name: Host inodes will fill in 24 hours
-                description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
-                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host filesystem device error
+                description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
+                query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
+                severity: critical
+                for: 2m
+              - name: Host inodes may fill in 24 hours
+                description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
+                query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
                 severity: warning
                 for: 2m
               - name: Host unusual disk read latency
                 description: Disk latency is growing (read operations > 100ms)
-                query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
                 severity: warning
                 for: 2m
               - name: Host unusual disk write latency
                 description: Disk latency is growing (write operations > 100ms)
-                query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
                 severity: warning
                 for: 2m
               - name: Host high CPU load
                 description: CPU load is > 80%
-                query: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
                 severity: warning
                 for: 10m
               - name: Host CPU is underutilized
-                description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs."
-                query: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
+                query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
                 severity: info
                 for: 1w
                 comments: |
                   You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
               - name: Host CPU steal noisy neighbor
                 description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
-                query: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
                 severity: warning
               - name: Host CPU high iowait
-                description: CPU iowait > 10%. A high iowait means that you are disk or network bound.
-                query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
+                query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
                 severity: warning
               - name: Host unusual disk IO
-                description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues."
-                query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities."
+                query: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
                 severity: warning
                 for: 5m
               - name: Host context switching high
@@ -250,86 +240,80 @@ groups:
                   Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
               - name: Host swap is filling up
                 description: Swap is filling up (>80%)
-                query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
                 severity: warning
                 for: 2m
               - name: Host systemd service crashed
                 description: "systemd service crashed"
-                query: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_systemd_unit_state{state="failed"} == 1)'
                 severity: warning
               - name: Host physical component too hot
                 description: "Physical hardware component too hot"
-                query: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
                 severity: warning
                 for: 5m
               - name: Host node overtemperature alarm
                 description: "Physical node temperature alarm triggered"
-                query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
                 severity: critical
-              - name: Host RAID array got inactive
-                description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically."
-                query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host software RAID insufficient drives
+                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
+                query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
                 severity: critical
-              - name: Host RAID disk failure
-                description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap"
-                query: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+              - name: Host software RAID disk failure
+                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
+                query: '(node_md_disks{state="failed"} > 0)'
                 severity: warning
                 for: 2m
               - name: Host kernel version deviations
-                description: Different kernel versions are running
-                query: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-                severity: warning
-                for: 6h
+                description: Kernel version for {{ $labels.instance }} has changed.
+                query: 'changes(node_uname_info[1h]) > 0'
+                severity: info
               - name: Host OOM kill detected
                 description: OOM kill detected
-                query: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(increase(node_vmstat_oom_kill[1m]) > 0)'
                 severity: warning
               - name: Host EDAC Correctable Errors detected
                 description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
-                query: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
                 severity: info
               - name: Host EDAC Uncorrectable Errors detected
                 description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
-                query: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_edac_uncorrectable_errors_total > 0)'
                 severity: warning
               - name: Host Network Receive Errors
                 description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
-                query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
                 severity: warning
                 for: 2m
               - name: Host Network Transmit Errors
                 description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
-                query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
                 severity: warning
                 for: 2m
-              - name: Host Network Interface Saturated
-                description: 'The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.'
-                query: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' # < to 10Gb to prevent +inf when max speed is unknown
-                severity: warning
-                for: 1m
               - name: Host Network Bond Degraded
                 description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
-                query: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '((node_bonding_active - node_bonding_slaves) != 0)'
                 severity: warning
                 for: 2m
               - name: Host conntrack limit
                 description: "The number of conntrack is approaching limit"
-                query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
                 severity: warning
                 for: 5m
               - name: Host clock skew
                 description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
-                query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
                 severity: warning
                 for: 10m
               - name: Host clock not synchronising
                 description: "Clock not synchronising. Ensure NTP is configured on this host."
-                query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
                 severity: warning
                 for: 2m
               - name: Host requires reboot
                 description: "{{ $labels.instance }} requires a reboot."
-                query: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+                query: '(node_reboot_required > 0)'
                 severity: info
                 for: 4h
 
@@ -339,31 +323,26 @@ groups:
             slug: smartctl-exporter
             doc_url: https://github.com/prometheus-community/smartctl_exporter
             rules:
-              - name: Smart device temperature warning
-                description: Device temperature  warning (instance {{ $labels.instance }})
-                query: smartctl_device_temperature > 60
+              - name: SMART device temperature warning
+                description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
+                query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
                 severity: warning
-                for: 2m
-              - name: Smart device temperature critical
-                description: Device temperature critical  (instance {{ $labels.instance }})
-                query: smartctl_device_temperature > 80
+              - name: SMART device temperature critical
+                description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
+                query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
                 severity: critical
-                for: 2m
-              - name: Smart critical warning
-                description: device has critical warning (instance {{ $labels.instance }})
-                query: smartctl_device_critical_warning > 0
+              - name: SMART critical warning
+                description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})
+                query: 'smartctl_device_critical_warning > 0'
                 severity: critical
-                for: 15m
-              - name: Smart media errors
-                description: device has media errors (instance {{ $labels.instance }})
-                query: smartctl_device_media_errors > 0
+              - name: SMART media errors
+                description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
+                query: 'smartctl_device_media_errors > 0'
                 severity: critical
-                for: 15m
-              - name: Smart NVME Wearout Indicator
-                description: NVMe device is wearing out (instance {{ $labels.instance }})
-                query: smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}
+              - name: SMART Wearout Indicator
+                description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
+                query: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
                 severity: critical
-                for: 15m
 
       - name: Docker containers
         exporters:
@@ -646,11 +625,11 @@ groups:
                 severity: critical
               - name: Postgresql table not auto vacuumed
                 description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
-                query: "(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
+                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
                 severity: warning
               - name: Postgresql table not auto analyzed
                 description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
-                query: "(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
+                query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
                 severity: warning
               - name: Postgresql too many connections
                 description: PostgreSQL instance has too many connections (> 80%).
@@ -660,7 +639,7 @@ groups:
               - name: Postgresql not enough connections
                 description: PostgreSQL instance should have more connections (> 5)
                 query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
-                severity: warning
+                severity: critical
                 for: 2m
               - name: Postgresql dead locks
                 description: PostgreSQL has dead-locks
@@ -672,7 +651,7 @@ groups:
                 severity: warning
               - name: Postgresql commit rate low
                 description: Postgresql seems to be processing very few transactions
-                query: "rate(pg_stat_database_xact_commit[1m]) < 10"
+                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                 severity: critical
                 for: 2m
               - name: Postgresql low XID consumption
@@ -703,7 +682,7 @@ groups:
                 query: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
                 severity: info
               - name: Postgresql SSL compression active
-                description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
+                description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
                 query: "sum(pg_stat_ssl_compression) > 0"
                 severity: critical
               - name: Postgresql too many locks acquired

From fc6b3faadc299610634e79c7174d44150ee5574f Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 28 Jan 2025 06:04:10 +0100
Subject: [PATCH 112/123] Fix from #405

---
 _data/rules.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 88e0785..6ad5f9d 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -331,6 +331,18 @@ groups:
                 description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
                 query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
                 severity: critical
+              - name: SMART device temperature over trip value
+                description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
+                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
+                severity: critical
+              - name: SMART device temperature nearing trip value
+                description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
+                query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
+                severity: warning
+              - name: SMART status
+                description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
+                query: 'smartctl_device_smart_status != 1'
+                severity: critical
               - name: SMART critical warning
                 description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})
                 query: 'smartctl_device_critical_warning > 0'

From 70ac7d9cae79a050378d9ad1fc456e9006bfed4f Mon Sep 17 00:00:00 2001
From: guruevi <evi.vanoost@gmail.com>
Date: Tue, 28 Jan 2025 00:06:47 -0500
Subject: [PATCH 113/123] Various updates and quality of life changes (#405)

* smartctl_exporter publishes both drive_trip and current drive temperatures. Since most of the alerts are going to be permanent, it does not make sense to wait for the alert to be on for a certain time. Temperature sensors likewise vary, using the last sample is not sufficient to alert on potential issues.

* Add an option to run GitHub Action manually

* Add an option to force running the action for testing purposes

* Set variables correctly

* Set variables correctly

* Publish

* Clean up some more metrics

* Publish

* Minor bug fixes

* Publish

* Removed queries that throw errors when systems are upgraded. Also fixed and simplified a few Postgres queries.

* Publish

* Refined some more queries

* Publish

* PostgreSQL now has optimized autovacuum behavior

* Publish

* PostgreSQL now has optimized autovacuum behavior

* Publish

* Publish

* Query fails if instance names are not unique across jobs. This fixes it.

* Publish

* Ruby is out of date

---------

Co-authored-by: samber <samber@users.noreply.github.com>
---
 .github/workflows/dist.yml                    |   6 +-
 .gitignore                                    |   1 +
 CONTRIBUTING.md                               |   4 +-
 .../rules/host-and-hardware/node-exporter.yml | 154 ++++++++----------
 dist/rules/postgresql/postgres-exporter.yml   |  14 +-
 .../embedded-exporter.yml                     |   4 +-
 .../smartctl-exporter.yml                     |  65 +++++---
 7 files changed, 130 insertions(+), 118 deletions(-)

diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml
index 1f64e36..479f901 100644
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@@ -1,6 +1,7 @@
 name: Publish
 
 on:
+  workflow_dispatch:
   push:
     branches:
       - master
@@ -24,11 +25,12 @@ jobs:
         uses: mikefarah/yq@master
 
       - name: Install liquid
-        run: gem install liquid-cli
+        run: |
+         gem install liquid -v 5.5.1
+         gem install liquid-cli 
 
       - name: Build rule configuration
         run: |
-          gem install liquid-cli
           cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
 
           rm -rf dist/rules
diff --git a/.gitignore b/.gitignore
index 12ca387..66a746a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ _site/
 .jekyll-metadata
 _data/rules.json
 test/rules/
+/node_modules
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1fcb24b..02b8c38 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,8 +32,8 @@ Or with Docker:
 docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
 ```
 
-Or with Docker-Compose:
+Or with Docker Compose:
 
 ```
-docker-compose up -d
+docker compose up -d
 ```
diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index a6adff1..d52b34d 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -5,7 +5,7 @@ groups:
   rules:
 
     - alert: HostOutOfMemory
-      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
       for: 2m
       labels:
         severity: warning
@@ -14,97 +14,88 @@ groups:
         description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostMemoryUnderMemoryPressure
-      expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
+      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
+      for: 0m
       labels:
         severity: warning
       annotations:
         summary: Host memory under memory pressure (instance {{ $labels.instance }})
-        description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostMemoryIsUnderutilized
-      expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 1w
+      expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
+      for: 0m
       labels:
         severity: info
       annotations:
         summary: Host Memory is underutilized (instance {{ $labels.instance }})
-        description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualNetworkThroughputIn
-      expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
+      expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      for: 0m
       labels:
         severity: warning
       annotations:
         summary: Host unusual network throughput in (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host receive bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualNetworkThroughputOut
-      expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
+      expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+      for: 0m
       labels:
         severity: warning
       annotations:
         summary: Host unusual network throughput out (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualDiskReadRate
-      expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 5m
+      expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
+      for: 0m
       labels:
         severity: warning
       annotations:
         summary: Host unusual disk read rate (instance {{ $labels.instance }})
-        description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostUnusualDiskWriteRate
-      expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host unusual disk write rate (instance {{ $labels.instance }})
-        description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk is too busy (IO wait > 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOutOfDiskSpace
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
       for: 2m
       labels:
-        severity: warning
+        severity: critical
       annotations:
         summary: Host out of disk space (instance {{ $labels.instance }})
         description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostDiskWillFillIn24Hours
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    - alert: HostDiskMayFillIn24Hours
+      expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
+        description: "Filesystem will likely run out of space within the next 24 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOutOfInodes
-      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
       for: 2m
       labels:
-        severity: warning
+        severity: critical
       annotations:
         summary: Host out of inodes (instance {{ $labels.instance }})
         description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostFilesystemDeviceError
-      expr: 'node_filesystem_device_error == 1'
+      expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
       for: 2m
       labels:
         severity: critical
       annotations:
         summary: Host filesystem device error (instance {{ $labels.instance }})
-        description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostInodesWillFillIn24Hours
-      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
       for: 2m
       labels:
         severity: warning
@@ -113,7 +104,7 @@ groups:
         description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualDiskReadLatency
-      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
       for: 2m
       labels:
         severity: warning
@@ -122,7 +113,7 @@ groups:
         description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualDiskWriteLatency
-      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
       for: 2m
       labels:
         severity: warning
@@ -131,7 +122,7 @@ groups:
         description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostHighCpuLoad
-      expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
       for: 10m
       labels:
         severity: warning
@@ -140,16 +131,16 @@ groups:
         description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostCpuIsUnderutilized
-      expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
       for: 1w
       labels:
         severity: info
       annotations:
         summary: Host CPU is underutilized (instance {{ $labels.instance }})
-        description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostCpuStealNoisyNeighbor
-      expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
       for: 0m
       labels:
         severity: warning
@@ -158,22 +149,22 @@ groups:
         description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostCpuHighIowait
-      expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
       for: 0m
       labels:
         severity: warning
       annotations:
         summary: Host CPU high iowait (instance {{ $labels.instance }})
-        description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualDiskIo
-      expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
       for: 5m
       labels:
         severity: warning
       annotations:
         summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostContextSwitchingHigh
       expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
@@ -185,7 +176,7 @@ groups:
         description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostSwapIsFillingUp
-      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
       for: 2m
       labels:
         severity: warning
@@ -194,7 +185,7 @@ groups:
         description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostSystemdServiceCrashed
-      expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_systemd_unit_state{state="failed"} == 1)'
       for: 0m
       labels:
         severity: warning
@@ -203,7 +194,7 @@ groups:
         description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostPhysicalComponentTooHot
-      expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
       for: 5m
       labels:
         severity: warning
@@ -220,44 +211,44 @@ groups:
         summary: Host node overtemperature alarm (instance {{ $labels.instance }})
         description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostRaidArrayGotInactive
-      expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    - alert: HostSoftwareRaidInsufficientDrives
+      expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
       for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Host RAID array got inactive (instance {{ $labels.instance }})
-        description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host Software RAID insufficient drives (instance {{ $labels.instance }})
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostRaidDiskFailure
-      expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+    - alert: HostSoftwareRaidDiskFailure
+      expr: '(node_md_disks{state="failed"} > 0)'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: Host RAID disk failure (instance {{ $labels.instance }})
-        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host Software RAID disk failure (instance {{ $labels.instance }})
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostKernelVersionDeviations
-      expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 6h
-      labels:
-        severity: warning
-      annotations:
-        summary: Host kernel version deviations (instance {{ $labels.instance }})
-        description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: HostOomKillDetected
-      expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'changes(node_uname_info[1h]) > 0'
       for: 0m
       labels:
-        severity: warning
+        severity: info
+      annotations:
+        summary: Host kernel version deviations (instance {{ $labels.instance }})
+        description: "Kernel version for {{ $labels.instance }} has changed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: HostOomKillDetected
+      expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
+      for: 0m
+      labels:
+        severity: critical
       annotations:
         summary: Host OOM kill detected (instance {{ $labels.instance }})
         description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostEdacCorrectableErrorsDetected
-      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
       for: 0m
       labels:
         severity: info
@@ -266,7 +257,7 @@ groups:
         description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostEdacUncorrectableErrorsDetected
-      expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_edac_uncorrectable_errors_total > 0)'
       for: 0m
       labels:
         severity: warning
@@ -275,7 +266,7 @@ groups:
         description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostNetworkReceiveErrors
-      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
       for: 2m
       labels:
         severity: warning
@@ -284,7 +275,7 @@ groups:
         description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostNetworkTransmitErrors
-      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
       for: 2m
       labels:
         severity: warning
@@ -292,17 +283,8 @@ groups:
         summary: Host Network Transmit Errors (instance {{ $labels.instance }})
         description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostNetworkInterfaceSaturated
-      expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 1m
-      labels:
-        severity: warning
-      annotations:
-        summary: Host Network Interface Saturated (instance {{ $labels.instance }})
-        description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
     - alert: HostNetworkBondDegraded
-      expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_bonding_active - node_bonding_slaves) != 0)'
       for: 2m
       labels:
         severity: warning
@@ -311,7 +293,7 @@ groups:
         description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostConntrackLimit
-      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
       for: 5m
       labels:
         severity: warning
@@ -320,7 +302,7 @@ groups:
         description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostClockSkew
-      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
       for: 10m
       labels:
         severity: warning
@@ -329,7 +311,7 @@ groups:
         description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostClockNotSynchronising
-      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
       for: 2m
       labels:
         severity: warning
@@ -338,7 +320,7 @@ groups:
         description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostRequiresReboot
-      expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_reboot_required > 0)'
       for: 4h
       labels:
         severity: info
diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 96ae5ea..692acef 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -32,7 +32,7 @@ groups:
         description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTableNotAutoVacuumed
-      expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 864000'
       for: 0m
       labels:
         severity: warning
@@ -41,7 +41,7 @@ groups:
         description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTableNotAutoAnalyzed
-      expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 864000'
       for: 0m
       labels:
         severity: warning
@@ -53,7 +53,7 @@ groups:
       expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
       for: 2m
       labels:
-        severity: warning
+        severity: critical
       annotations:
         summary: Postgresql too many connections (instance {{ $labels.instance }})
         description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -62,7 +62,7 @@ groups:
       expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
       for: 2m
       labels:
-        severity: warning
+        severity: critical
       annotations:
         summary: Postgresql not enough connections (instance {{ $labels.instance }})
         description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -86,8 +86,8 @@ groups:
         description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlCommitRateLow
-      expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
-      for: 2m
+      expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
+      for: 5m
       labels:
         severity: critical
       annotations:
@@ -155,7 +155,7 @@ groups:
         severity: critical
       annotations:
         summary: Postgresql SSL compression active (instance {{ $labels.instance }})
-        description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Database allows connections with SSL compression enabled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTooManyLocksAcquired
       expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
index 65bfd82..8a2e402 100644
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@@ -32,7 +32,7 @@ groups:
         description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PrometheusTargetMissingWithWarmupTime
-      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))'
       for: 0m
       labels:
         severity: critical
@@ -248,7 +248,7 @@ groups:
         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PrometheusTimeseriesCardinality
-      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+      expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu.*|node_systemd_unit_state"})) > 10000'
       for: 0m
       labels:
         severity: warning
diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
index 1946c38..866d715 100644
--- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
+++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
@@ -5,46 +5,73 @@ groups:
   rules:
 
     - alert: SmartDeviceTemperatureWarning
-      expr: 'smartctl_device_temperature > 60'
-      for: 2m
+      expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
+      for: 0m
       labels:
         severity: warning
       annotations:
-        summary: Smart device temperature warning (instance {{ $labels.instance }})
-        description: "Device temperature  warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SMART device temperature warning (instance {{ $labels.instance }})
+        description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SmartDeviceTemperatureCritical
-      expr: 'smartctl_device_temperature > 80'
-      for: 2m
+      expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
+      for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Smart device temperature critical (instance {{ $labels.instance }})
-        description: "Device temperature critical  (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SMART device temperature critical (instance {{ $labels.instance }})
+        description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartDeviceTemperatureOverTripValue
+      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: SMART device temperature over trip value (instance {{ $labels.instance }})
+        description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartDeviceTemperatureNearingTripValue
+      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
+        description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: SmartStatus
+      expr: 'smartctl_device_smart_status != 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: SMART status (instance {{ $labels.instance }})
+        description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SmartCriticalWarning
       expr: 'smartctl_device_critical_warning > 0'
-      for: 15m
+      for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Smart critical warning (instance {{ $labels.instance }})
-        description: "device has critical warning (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SMART critical warning (instance {{ $labels.instance }})
+        description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: SmartMediaErrors
       expr: 'smartctl_device_media_errors > 0'
-      for: 15m
+      for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Smart media errors (instance {{ $labels.instance }})
-        description: "device has media errors (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SMART media errors (instance {{ $labels.instance }})
+        description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: SmartNvmeWearoutIndicator
-      expr: 'smartctl_device_available_spare{device=~"nvme.*"} < smartctl_device_available_spare_threshold{device=~"nvme.*"}'
-      for: 15m
+    - alert: SmartWearoutIndicator
+      expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
+      for: 0m
       labels:
         severity: critical
       annotations:
-        summary: Smart NVME Wearout Indicator (instance {{ $labels.instance }})
-        description: "NVMe device is wearing out (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: SMART Wearout Indicator (instance {{ $labels.instance }})
+        description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 10d00c66da78c3c47ff3f76cf9903ad821b5fbbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20B=C3=BChler?=
 <Stunkymonkey@users.noreply.github.com>
Date: Tue, 4 Feb 2025 14:23:14 +0100
Subject: [PATCH 114/123] Add caddy.yml (#450)

---
 README.md                              |  1 +
 _data/rules.yml                        | 23 +++++++++++++++++-
 dist/rules/caddy/embedded-exporter.yml | 32 ++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 dist/rules/caddy/embedded-exporter.yml

diff --git a/README.md b/README.md
index fdb5643..7011402 100644
--- a/README.md
+++ b/README.md
@@ -67,6 +67,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
 - [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
 - [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
+- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
 
 #### Runtimes
 
diff --git a/_data/rules.yml b/_data/rules.yml
index 6ad5f9d..00aa84a 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1950,6 +1950,27 @@ groups:
                 severity: critical
                 for: 1m
 
+      - name: Caddy
+        exporters:
+          - name: Embedded exporter
+            doc_url: https://caddyserver.com/docs/metrics
+            rules:
+              - name: Caddy Reverse Proxy Down
+                description: "All Caddy reverse proxies are down"
+                query: "count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
+                severity: critical
+                for: 0m
+              - name: Caddy high HTTP 4xx error rate service
+                description: "Caddy service 4xx error rate is above 5%"
+                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+                severity: critical
+                for: 1m
+              - name: Caddy high HTTP 5xx error rate service
+                description: "Caddy service 5xx error rate is above 5%"
+                query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+                severity: critical
+                for: 1m
+
   - name: Runtimes
     services:
       - name: PHP-FPM
@@ -2975,7 +2996,7 @@ groups:
           - slug: embedded-exporter
             rules:
               - name: Grafana Alloy service down
-                description: Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running. 
+                description: Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.
                 query: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m)  '
                 severity: critical
 
diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/embedded-exporter.yml
new file mode 100644
index 0000000..4f23002
--- /dev/null
+++ b/dist/rules/caddy/embedded-exporter.yml
@@ -0,0 +1,32 @@
+groups:
+
+- name: EmbeddedExporter
+
+  rules:
+
+    - alert: CaddyReverseProxyDown
+      expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Caddy reverse proxy down (instance {{ $labels.instance }})
+        description: "All Caddy reverse proxies are down\n LABELS = {{ $labels }}"
+
+    - alert: CaddyHighHttp4xxErrorRateService
+      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
+        description: "Caddy service 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: CaddyHighHttp5xxErrorRateService
+      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
+        description: "Caddy service 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From ae12871fa93def4e0d4f9dfeea6de0a52fd3830a Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 4 Feb 2025 16:40:21 +0100
Subject: [PATCH 115/123] Update rules.yml

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 00aa84a..c35815b 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -187,7 +187,7 @@ groups:
                 for: 2m
               - name: Host filesystem device error
                 description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
-                query: "node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1"
+                query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
                 severity: critical
                 for: 2m
               - name: Host inodes may fill in 24 hours

From eb92a798984d26150dfe196a0fe252dc77e27423 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Tue, 4 Feb 2025 16:44:31 +0100
Subject: [PATCH 116/123] upgrade github action ruby version

---
 .github/workflows/dist.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml
index 479f901..85b6ce4 100644
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
-          ruby-version: 2.7
+          ruby-version: 3.4
 
       - name: Set up yq
         uses: mikefarah/yq@master

From 2f9c0c0483a64b54c4771d4269a19267e8a0a3a1 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 16 Feb 2025 23:15:43 +0100
Subject: [PATCH 117/123] upgrade ruby version

---
 .github/workflows/test.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c397867..00059f0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -13,7 +13,7 @@ jobs:
       - name: Set up Ruby
         uses: ruby/setup-ruby@v1
         with:
-          ruby-version: 2.7
+          ruby-version: 3.4
 
       - name: Set up yq
         uses: mikefarah/yq@master
@@ -31,7 +31,7 @@ jobs:
             mkdir -p "${subdir}"
 
             # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
-    
+
             for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
               exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
               cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml

From fb857e8b39e3fda796a1219d145a6d3e0222a7cd Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 16 Feb 2025 23:16:36 +0100
Subject: [PATCH 118/123] data: fix rules

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index c35815b..9e60ba4 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -663,7 +663,7 @@ groups:
                 severity: warning
               - name: Postgresql commit rate low
                 description: Postgresql seems to be processing very few transactions
-                query: "increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5"
+                query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
                 severity: critical
                 for: 2m
               - name: Postgresql low XID consumption

From 20f9a36615278f0e77540f619e7b0443db628543 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 16 Feb 2025 22:17:02 +0000
Subject: [PATCH 119/123] Publish

---
 .../caddy/{embedded-exporter.yml => null.yml} |  6 ++---
 .../rules/host-and-hardware/node-exporter.yml | 26 +++++++++----------
 dist/rules/postgresql/postgres-exporter.yml   | 10 +++----
 .../embedded-exporter.yml                     |  4 +--
 4 files changed, 23 insertions(+), 23 deletions(-)
 rename dist/rules/caddy/{embedded-exporter.yml => null.yml} (87%)

diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/null.yml
similarity index 87%
rename from dist/rules/caddy/embedded-exporter.yml
rename to dist/rules/caddy/null.yml
index 4f23002..64b0230 100644
--- a/dist/rules/caddy/embedded-exporter.yml
+++ b/dist/rules/caddy/null.yml
@@ -1,6 +1,6 @@
 groups:
 
-- name: EmbeddedExporter
+- name: 
 
   rules:
 
@@ -10,8 +10,8 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Caddy reverse proxy down (instance {{ $labels.instance }})
-        description: "All Caddy reverse proxies are down\n LABELS = {{ $labels }}"
+        summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
+        description: "All Caddy reverse proxies are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: CaddyHighHttp4xxErrorRateService
       expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index d52b34d..f168bcf 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -38,7 +38,7 @@ groups:
         severity: warning
       annotations:
         summary: Host unusual network throughput in (instance {{ $labels.instance }})
-        description: "Host receive bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualNetworkThroughputOut
       expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
@@ -74,7 +74,7 @@ groups:
         severity: warning
       annotations:
         summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem will likely run out of space within the next 24 hours\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOutOfInodes
       expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
@@ -94,17 +94,17 @@ groups:
         summary: Host filesystem device error (instance {{ $labels.instance }})
         description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: HostInodesWillFillIn24Hours
+    - alert: HostInodesMayFillIn24Hours
       expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
+        description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostUnusualDiskReadLatency
-      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0'
       for: 2m
       labels:
         severity: warning
@@ -164,10 +164,10 @@ groups:
         severity: warning
       annotations:
         summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostContextSwitchingHigh
-      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
       for: 0m
       labels:
         severity: warning
@@ -203,7 +203,7 @@ groups:
         description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostNodeOvertemperatureAlarm
-      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
       for: 0m
       labels:
         severity: critical
@@ -217,7 +217,7 @@ groups:
       labels:
         severity: critical
       annotations:
-        summary: Host Software RAID insufficient drives (instance {{ $labels.instance }})
+        summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
         description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostSoftwareRaidDiskFailure
@@ -226,7 +226,7 @@ groups:
       labels:
         severity: warning
       annotations:
-        summary: Host Software RAID disk failure (instance {{ $labels.instance }})
+        summary: Host software RAID disk failure (instance {{ $labels.instance }})
         description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostKernelVersionDeviations
@@ -236,13 +236,13 @@ groups:
         severity: info
       annotations:
         summary: Host kernel version deviations (instance {{ $labels.instance }})
-        description: "Kernel version for {{ $labels.instance }} has changed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOomKillDetected
       expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
       for: 0m
       labels:
-        severity: critical
+        severity: warning
       annotations:
         summary: Host OOM kill detected (instance {{ $labels.instance }})
         description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 692acef..42e5bb8 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -32,7 +32,7 @@ groups:
         description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTableNotAutoVacuumed
-      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 864000'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
       for: 0m
       labels:
         severity: warning
@@ -41,7 +41,7 @@ groups:
         description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTableNotAutoAnalyzed
-      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 864000'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
       for: 0m
       labels:
         severity: warning
@@ -53,7 +53,7 @@ groups:
       expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
       for: 2m
       labels:
-        severity: critical
+        severity: warning
       annotations:
         summary: Postgresql too many connections (instance {{ $labels.instance }})
         description: "PostgreSQL instance has too many connections (> 80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@@ -87,7 +87,7 @@ groups:
 
     - alert: PostgresqlCommitRateLow
       expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
-      for: 5m
+      for: 2m
       labels:
         severity: critical
       annotations:
@@ -155,7 +155,7 @@ groups:
         severity: critical
       annotations:
         summary: Postgresql SSL compression active (instance {{ $labels.instance }})
-        description: "Database allows connections with SSL compression enabled.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PostgresqlTooManyLocksAcquired
       expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
index 8a2e402..908f001 100644
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@@ -32,7 +32,7 @@ groups:
         description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PrometheusTargetMissingWithWarmupTime
-      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))'
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
       for: 0m
       labels:
         severity: critical
@@ -248,7 +248,7 @@ groups:
         description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: PrometheusTimeseriesCardinality
-      expr: '(label_replace(count by (__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") unless on (__name__) ({__name__=~"node_cpu.*|node_systemd_unit_state"})) > 10000'
+      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
       for: 0m
       labels:
         severity: warning

From 4a7b9b5c7260464ce74e82d19de7e1a34d151986 Mon Sep 17 00:00:00 2001
From: asdf1234 <lirulei@users.noreply.github.com>
Date: Mon, 17 Feb 2025 06:29:00 +0800
Subject: [PATCH 120/123] Update mysqld-exporter.yml (#442)

* Update mysqld-exporter.yml

add some rules

* Add new MySQL monitoring rules

---------

Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
---
 _data/rules.yml                      | 20 +++++++++++++
 dist/rules/mysql/mysqld-exporter.yml | 45 ++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index 9e60ba4..11863ca 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -616,6 +616,26 @@ groups:
                 description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
                 query: "mysql_global_status_uptime < 60"
                 severity: info
+              - name: MySQL High QPS
+                description: MySQL is being overloaded with unusual QPS (> 10k QPS).
+                query: "irate(mysql_global_status_questions[1m]) > 10000"
+                severity: info
+                for: 2m
+              - name: MySQL too many open files
+                description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.
+                query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75"
+                severity: warning
+                for: 2m
+              - name: MySQL InnoDB Force Recovery is enabled
+                description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
+                query: "mysql_global_variables_innodb_force_recovery != 0"
+                severity: warning
+                for: 2m
+              - name: MySQL InnoDB history_len too long
+                description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
+                query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
+                severity: warning
+                for: 2m
 
       - name: PostgreSQL
         exporters:
diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml
index 380fca3..a9bdff8 100644
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@@ -93,3 +93,48 @@ groups:
       annotations:
         summary: MySQL restarted (instance {{ $labels.instance }})
         description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: Mysql_High_QPS
+      expr: 'irate(mysql_global_status_questions[1m]) > 10000'
+      for: 2m
+      labels:
+        severity: info
+      annotations:
+        summary: Mysql_High_QPS (instance {{ $labels.instance }})
+        description: "Mysql_High_QPS on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MySQL too many open files
+      expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: MySQL too many Open files  (instance {{ $labels.instance }})
+        description: "MySQL too many Open files,please conside increase variables open_files_limit  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MySQL too many connections
+      expr: 'sum by(instance) (mysql_info_schema_processes_by_host) / sum by(instance) (mysql_global_variables_max_connections) * 100 > 80'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: MySQL too many connections  (instance {{ $labels.instance }})
+        description: "MySQL too many connections,current connections is more than 80%  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MySQL InnoDB Force Recovery is enabled
+      expr: 'mysql_global_variables_innodb_force_recovery != 0'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
+        description: "MySQL InnoDB Force Recovery not equal 0  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: MySQL InnoDB history_len too long
+      expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: MySQL history_len (undo log) too long (instance {{ $labels.instance }})
+        description: "MySQL  history_len (undo log) too long   {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From 12b8acb1b8b4e716988e234212f2c18d4566de28 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 16 Feb 2025 22:29:24 +0000
Subject: [PATCH 121/123] Publish

---
 dist/rules/mysql/mysqld-exporter.yml | 31 ++++++++++------------------
 1 file changed, 11 insertions(+), 20 deletions(-)

diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml
index a9bdff8..3ef716f 100644
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@@ -94,47 +94,38 @@ groups:
         summary: MySQL restarted (instance {{ $labels.instance }})
         description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: Mysql_High_QPS
+    - alert: MysqlHighQps
       expr: 'irate(mysql_global_status_questions[1m]) > 10000'
       for: 2m
       labels:
         severity: info
       annotations:
-        summary: Mysql_High_QPS (instance {{ $labels.instance }})
-        description: "Mysql_High_QPS on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: MySQL High QPS (instance {{ $labels.instance }})
+        description: "MySQL is being overloaded with unusual QPS (> 10k QPS).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: MySQL too many open files
+    - alert: MysqlTooManyOpenFiles
       expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: MySQL too many Open files  (instance {{ $labels.instance }})
-        description: "MySQL too many Open files,please conside increase variables open_files_limit  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: MySQL too many open files (instance {{ $labels.instance }})
+        description: "MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: MySQL too many connections
-      expr: 'sum by(instance) (mysql_info_schema_processes_by_host) / sum by(instance) (mysql_global_variables_max_connections) * 100 > 80'
-      for: 2m
-      labels:
-        severity: warning
-      annotations:
-        summary: MySQL too many connections  (instance {{ $labels.instance }})
-        description: "MySQL too many connections,current connections is more than 80%  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-
-    - alert: MySQL InnoDB Force Recovery is enabled
+    - alert: MysqlInnodbForceRecoveryIsEnabled
       expr: 'mysql_global_variables_innodb_force_recovery != 0'
       for: 2m
       labels:
         severity: warning
       annotations:
         summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
-        description: "MySQL InnoDB Force Recovery not equal 0  {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
-    - alert: MySQL InnoDB history_len too long
+    - alert: MysqlInnodbHistory_lenTooLong
       expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
       for: 2m
       labels:
         severity: warning
       annotations:
-        summary: MySQL history_len (undo log) too long (instance {{ $labels.instance }})
-        description: "MySQL  history_len (undo log) too long   {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
+        description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

From add097c48994e10502d274c2b2f7b5eaf1db5172 Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Sun, 16 Feb 2025 23:36:32 +0100
Subject: [PATCH 122/123] data: revert 5f57f09 (see #398)

---
 _data/rules.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 11863ca..d33d641 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -182,7 +182,7 @@ groups:
                 for: 2m
               - name: Host out of inodes
                 description: Disk is almost running out of available inodes (< 10% left)
-                query: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
+                query: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
                 severity: critical
                 for: 2m
               - name: Host filesystem device error

From 7889a9a29bc6a7d69ea68f06ce33509be80f5ad4 Mon Sep 17 00:00:00 2001
From: samber <samber@users.noreply.github.com>
Date: Sun, 16 Feb 2025 22:37:09 +0000
Subject: [PATCH 123/123] Publish

---
 dist/rules/host-and-hardware/node-exporter.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index f168bcf..7b553eb 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -77,7 +77,7 @@ groups:
         description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: HostOutOfInodes
-      expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
+      expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
       for: 2m
       labels:
         severity: critical