From 04a8ae2fe323e2dcc405f77f2419339e3dca912b Mon Sep 17 00:00:00 2001
From: Samuel Berthe <dev@samuel-berthe.fr>
Date: Mon, 6 Apr 2026 20:12:18 +0200
Subject: [PATCH] fix(data): PromQL type fixes, job filter cleanup, query
 correctness review

- Replace rate() with deriv()/delta() on gauge metrics:
  node_vmstat_pgmajfault, cassandra_stats (criteo exporter),
  gitlab_ci_pipeline_failure_reasons, flink_taskmanager_job_task_numRecordsIn
- Fix histogram_quantile on non-_bucket metric: cilium_policy_implementation_delay
- Fix Thanos bucket replicate latency: use _count instead of _bucket for guard clause
- Fix Thanos query latency: use _count instead of _bucket for guard clause
- Restore job filter in Thanos objstore guard clauses (compact + store)
- Remove redundant job= filters from unique metrics: 18 Thanos rules,
  kube_persistentvolume_status_phase, otelcol_process_runtime_*
- Fix high-cardinality Istio latency grouping (drop source labels from by())
- Add division-by-zero guard to host context switch ratio
- Raise noisy ClickHouse thresholds: RejectedInserts > 2, DelayedInserts > 10
- Remove redundant for: 1m from HAProxy check failure rules
- Add job rename comments to up{job=...} rules (Hadoop, OpenStack, SNMP, OTel)
- Remove external mixin references from comments
- Fix Tempo dropped spans metric name: add missing _total suffix
- Fix Thanos bucket replicate run latency: add missing le label in by()
---
 _data/rules.yml | 91 ++++++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 43 deletions(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 3653381..2da43cf 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -148,8 +148,10 @@ groups:
                 for: 2m
               - name: Host memory under memory pressure
                 description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)."
-                query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
+                query: "(deriv(node_vmstat_pgmajfault[5m]) > 1000)"
                 severity: warning
+                comments: |
+                  node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
               - name: Host Memory is underutilized
                 description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
                 query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
@@ -238,7 +240,7 @@ groups:
                 for: 5m
               - name: Host context switching high
                 description: Context switching is growing on the node (twice the daily average during the last 15m)
-                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
                 severity: warning
                 comments: |
                   x2 context switches is an arbitrary number.
@@ -266,7 +268,7 @@ groups:
                 description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
                 query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
                 comments: |
-                  Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
+                  Uses ignoring(state) to handle additional labels on node_md_disks.
                 severity: critical
               - name: Host software RAID disk failure
                 description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
@@ -1620,7 +1622,7 @@ groups:
                 for: 2m
               - name: Cassandra authentication failures
                 description: Increase of Cassandra authentication failures
-                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
+                query: 'deriv(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'  # gauge metric: deriv() yields a per-second slope, so the threshold stays in failures/s
                 severity: warning
                 for: 2m
               - name: Cassandra node down
@@ -1657,7 +1659,7 @@ groups:
                 for: 2m
               - name: Cassandra connection timeouts total (Criteo)
                 description: Some connection between nodes are ending in timeout
-                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
+                query: 'deriv(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'  # gauge metric: deriv() yields a per-second slope, so the threshold stays in timeouts/s
                 severity: critical
                 for: 2m
               - name: Cassandra storage exceptions (Criteo)
@@ -1776,12 +1778,12 @@ groups:
 
               - name: ClickHouse rejected insert queries
                 description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
-                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
+                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2"
                 severity: warning
                 for: 1m
               - name: ClickHouse delayed insert queries
                 description: "INSERTs delayed due to high number of active parts."
-                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
+                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10"
                 severity: warning
                 for: 2m
               - name: ClickHouse zookeeper hardware exception
@@ -2390,7 +2392,6 @@ groups:
                 description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                 query: increase(haproxy_server_check_failures_total[1m]) > 2
                 severity: warning
-                for: 1m
           - name: prometheus/haproxy_exporter (HAProxy < v2)
             slug: haproxy-exporter-v1
             doc_url: https://github.com/prometheus/haproxy_exporter
@@ -2470,7 +2471,6 @@ groups:
                 description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                 query: "increase(haproxy_server_check_failures_total[1m]) > 2"
                 severity: warning
-                for: 1m
 
       - name: Traefik
         exporters:
@@ -2696,7 +2696,7 @@ groups:
                 for: 1m
               - name: Istio latency 99 percentile
                 description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms)."
-                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
+                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000"
                 severity: warning
                 for: 1m
               - name: Istio Pilot Duplicate Entry
@@ -3041,7 +3041,7 @@ groups:
                   Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
               - name: Flink no records processed
                 description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes."
-                query: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
+                query: "delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
                 severity: warning
                 for: 5m
                 comments: |
@@ -3125,6 +3125,7 @@ groups:
                 comments: |
                   When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
                   so this alert may not fire. Prefer application-level availability metrics if available.
+                  Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
 
               # Alert rule for ResourceManager availability
               - name: Hadoop Resource Manager Down
@@ -3135,6 +3136,7 @@ groups:
                 comments: |
                   When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
                   so this alert may not fire. Prefer application-level availability metrics if available.
+                  Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
 
               # Alert rule for DataNode status
               - name: Hadoop Data Node Out Of Service
@@ -3269,7 +3271,7 @@ groups:
                 severity: critical
               - name: Kubernetes PersistentVolume error
                 description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
-                query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
+                query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
                 severity: critical
               - name: Kubernetes StatefulSet down
                 description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
@@ -3510,6 +3512,8 @@ groups:
                 query: 'up{job=~".*openstack.*"} == 0'
                 severity: critical
                 for: 2m
+                comments: |
+                  Adjust the job label regex to match the actual job name in your Prometheus scrape config.
               - name: OpenStack Nova agent down
                 description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
                 query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
@@ -3804,7 +3808,7 @@ groups:
                 for: 5m
               - name: GitLab CI pipeline failures increasing
                 description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)."
-                query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05"
+                query: "deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05"
                 severity: warning
                 for: 10m
                 comments: |
@@ -3903,7 +3907,6 @@ groups:
                 comments: |
                   ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
                   concurrency limits. This directly impacts users trying to push, pull, or clone.
-                  This alert is derived from the GitLab Omnibus default rules.
               - name: GitLab Gitaly high RPC latency
                 description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)."
                 query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1'
@@ -4238,7 +4241,8 @@ groups:
                 query: 'up{job=~"snmp.*"} == 0'
                 severity: critical
                 for: 5m
-                comments: From the official snmp-mixin.
+                comments: |
+                  Adjust the job=~"snmp.*" regex to match the actual job name in your Prometheus scrape config.
               - name: SNMP interface down
                 description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
                 query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)'
@@ -4378,7 +4382,7 @@ groups:
                 for: 5m
               - name: Cilium agent policy implementation delay
                 description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies."
-                query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60"
+                query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60"
                 severity: warning
                 for: 5m
                 comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
@@ -4510,8 +4514,7 @@ groups:
                 for: 1m
                 comments: |
                   ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
-                  The official Ceph mixin splits this into separate warning (==1) and critical (==2) alerts.
-                  This rule fires on any non-OK state. Adjust severity or split as needed.
+                  This rule fires on any non-OK state. Split into separate warning/critical rules by using ==1 and ==2 thresholds if needed.
               - name: Ceph monitor clock skew
                 description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
                 query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
@@ -4541,7 +4544,7 @@ groups:
                 for: 5m
                 comments: |
                   Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
-                  The official mixin uses ceph_health_detail for OSD space alerts.
+                  ceph_health_detail can also be used for more granular OSD space alerts.
               - name: Ceph OSD reweighted
                 description: Ceph Object Storage Daemon takes too much time to resize.
                 query: "ceph_osd_weight < 1"
@@ -4874,12 +4877,12 @@ groups:
                 for: 5m
               - name: Thanos Compactor Halted
                 description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
-                query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
+                query: 'thanos_compact_halted == 1'
                 severity: warning
                 for: 5m
               - name: Thanos Compactor High Compaction Failures
                 description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
-                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Thanos Compact Bucket High Operation Failures
@@ -4919,12 +4922,12 @@ groups:
                   Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled.
               - name: Thanos Query High D N S Failures
                 description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints."
-                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Thanos Query Instant Latency High
                 description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
-                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
                 severity: critical
                 for: 10m
               - name: Thanos Query Range Latency High
@@ -4952,22 +4955,22 @@ groups:
                 for: 10m
               - name: Thanos Receive High Replication Failures
                 description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
-                query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
+                query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100'
                 severity: warning
                 for: 5m
               - name: Thanos Receive High Forward Request Failures
                 description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
-                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0'
                 severity: info
                 for: 5m
               - name: Thanos Receive High Hashring File Refresh Failures
                 description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
-                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Thanos Receive Config Reload Failure
                 description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
-                query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
+                query: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1'
                 severity: warning
                 for: 5m
               - name: Thanos Receive No Upload
@@ -4987,7 +4990,7 @@ groups:
                 for: 5m
               - name: Thanos Sidecar No Connection To Started Prometheus
                 description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
-                query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
+                query: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
                 severity: critical
                 for: 5m
           - name: Thanos Store
@@ -5000,7 +5003,7 @@ groups:
                 for: 5m
               - name: Thanos Store Series Gate Latency High
                 description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
-                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)'
                 severity: warning
                 for: 10m
               - name: Thanos Store Bucket High Operation Failures
@@ -5018,12 +5021,12 @@ groups:
             rules:
               - name: Thanos Rule Queue Is Dropping Alerts
                 description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0'
                 severity: critical
                 for: 5m
               - name: Thanos Rule Sender Is Failing Alerts
                 description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0'
                 severity: critical
                 for: 5m
               - name: Thanos Rule High Rule Evaluation Failures
@@ -5033,7 +5036,7 @@ groups:
                 for: 5m
               - name: Thanos Rule High Rule Evaluation Warnings
                 description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
+                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05'
                 comments: |
                   Threshold of 0.05/s avoids firing on transient single-event spikes.
                 severity: info
@@ -5050,17 +5053,17 @@ groups:
                 for: 5m
               - name: Thanos Rule Config Reload Failure
                 description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
-                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
+                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1'
                 severity: info
                 for: 5m
               - name: Thanos Rule Query High D N S Failures
                 description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
-                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Thanos Rule Alertmanager High D N S Failures
                 description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
-                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Thanos Rule No Evaluation For10 Intervals
@@ -5070,7 +5073,7 @@ groups:
                 for: 5m
               - name: Thanos No Rule Evaluations
                 description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
-                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules) > 0'
                 severity: critical
                 for: 5m
           - name: Thanos Bucket Replicate
@@ -5078,12 +5081,12 @@ groups:
             rules:
               - name: Thanos Bucket Replicate Error Rate
                 description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
-                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0'
                 severity: critical
                 for: 5m
               - name: Thanos Bucket Replicate Run Latency
                 description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
-                query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)'
                 severity: critical
                 for: 5m
           - name: Thanos Component Absent
@@ -5270,7 +5273,7 @@ groups:
                 severity: critical
                 for: 24h
                 comments: |
-                  Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
+                  Threshold of 100 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
               - name: Tempo distributor usage tracker errors
                 description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})."
                 query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05
@@ -5285,7 +5288,7 @@ groups:
                 for: 15m
               - name: Tempo metrics generator service graphs dropping spans
                 description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}.
-                query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
+                query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
                 severity: warning
                 for: 15m
               - name: Tempo metrics generator collections failing
@@ -5453,7 +5456,7 @@ groups:
                 description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.
                 query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
                 comments: |
-                  Threshold aligned with official Mimir mixin (30 minutes).
+                  Threshold of 30 minutes. Adjust based on your sync interval.
                 severity: critical
                 for: 5m
               - name: Mimir store gateway no synced tenants
@@ -5495,7 +5498,7 @@ groups:
                 description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})."
                 query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0
                 comments: |
-                  Using 24h window per official mixin — compaction skips are rare events.
+                  Using a 24h window as compaction skips are rare events.
                 severity: warning
                 for: 5m
               # Ruler
@@ -5616,6 +5619,8 @@ groups:
                 query: 'up{job=~".*otel.*collector.*"} == 0'
                 severity: critical
                 for: 1m
+                comments: |
+                  Adjust the job label regex to match the actual job name in your Prometheus scrape config.
               - name: OpenTelemetry Collector receiver refused spans
                 description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}."
                 query: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
@@ -5680,7 +5685,7 @@ groups:
                 for: 5m
               - name: OpenTelemetry Collector high memory usage
                 description: "OpenTelemetry Collector memory usage is above 90%"
-                query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
+                query: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
                 severity: warning
                 for: 5m
               - name: OpenTelemetry Collector OTLP receiver errors