fix(data): PromQL type fixes, job filter cleanup, query correctness review

- Replace rate()/increase() with deriv()/delta() on gauge metrics: node_vmstat_pgmajfault, cassandra_stats (criteo exporter), gitlab_ci_pipeline_failure_reasons, flink_taskmanager_job_task_numRecordsIn
- Fix histogram_quantile on non-_bucket metric: cilium_policy_implementation_delay
- Fix Thanos bucket replicate latency: use _count instead of _bucket for guard clause
- Fix Thanos query latency: use _count instead of _bucket for guard clause
- Restore job filter in Thanos objstore guard clauses (compact + store)
- Remove redundant job= filters from unique metrics: ~30 Thanos rules, kube_persistentvolume_status_phase, otelcol_process_runtime_*
- Fix high-cardinality Istio latency grouping (drop source labels from by())
- Add division-by-zero guard to host context switch ratio
- Raise noisy ClickHouse thresholds: RejectedInserts > 2, DelayedInserts > 10
- Remove redundant for: 1m from HAProxy check failure rules
- Add job rename comments to up{job=...} rules (Hadoop, OpenStack, SNMP, OTel)
- Remove external mixin references from comments
- Fix Tempo dropped spans metric name: add missing _total suffix
- Fix Thanos bucket replicate run latency: add missing le label in by()
2026-06-24 18:36:59 +08:00 · 2026-04-06 20:12:18 +02:00 · 2026-04-06 20:12:18 +02:00 · 04a8ae2fe3
commit 04a8ae2fe3
parent ac32c98098
1 changed file with 48 additions and 43 deletions
--- a/_data/rules.yml
+++ b/_data/rules.yml
@ -148,8 +148,10 @@ groups:
                for: 2m
              - name: Host memory under memory pressure
                description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s)."
-                query: "(rate(node_vmstat_pgmajfault[5m]) > 1000)"
+                query: "(deriv(node_vmstat_pgmajfault[5m]) > 1000)"
                severity: warning
+                comments: |
+                  node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
              - name: Host Memory is underutilized
                description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
                query: "min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8"
@ -238,7 +240,7 @@ groups:
                for: 5m
              - name: Host context switching high
                description: Context switching is growing on the node (twice the daily average during the last 15m)
-                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
+                query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
                severity: warning
                comments: |
                  x2 context switches is an arbitrary number.
@ -266,7 +268,7 @@ groups:
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
                query: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
                comments: |
-                  Uses ignoring(state) to handle additional labels on node_md_disks. Matches the official node-exporter mixin.
+                  Uses ignoring(state) to handle additional labels on node_md_disks.
                severity: critical
              - name: Host software RAID disk failure
                description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
@ -1620,7 +1622,7 @@ groups:
                for: 2m
              - name: Cassandra authentication failures
                description: Increase of Cassandra authentication failures
-                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
+                query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
                severity: warning
                for: 2m
              - name: Cassandra node down
@ -1657,7 +1659,7 @@ groups:
                for: 2m
              - name: Cassandra connection timeouts total (Criteo)
                description: Some connection between nodes are ending in timeout
-                query: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
+                query: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
                severity: critical
                for: 2m
              - name: Cassandra storage exceptions (Criteo)
@ -1776,12 +1778,12 @@ groups:

              - name: ClickHouse rejected insert queries
                description: "INSERTs rejected due to too many active data parts. Reduce insert frequency."
-                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 0"
+                query: "increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2"
                severity: warning
                for: 1m
              - name: ClickHouse delayed insert queries
                description: "INSERTs delayed due to high number of active parts."
-                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 0"
+                query: "increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10"
                severity: warning
                for: 2m
              - name: ClickHouse zookeeper hardware exception
@ -2390,7 +2392,6 @@ groups:
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: increase(haproxy_server_check_failures_total[1m]) > 2
                severity: warning
-                for: 1m
          - name: prometheus/haproxy_exporter (HAProxy < v2)
            slug: haproxy-exporter-v1
            doc_url: https://github.com/prometheus/haproxy_exporter
@ -2470,7 +2471,6 @@ groups:
                description: Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)
                query: "increase(haproxy_server_check_failures_total[1m]) > 2"
                severity: warning
-                for: 1m

      - name: Traefik
        exporters:
@ -2696,7 +2696,7 @@ groups:
                for: 1m
              - name: Istio latency 99 percentile
                description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms)."
-                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
+                query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000"
                severity: warning
                for: 1m
              - name: Istio Pilot Duplicate Entry
@ -3041,7 +3041,7 @@ groups:
                  Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
              - name: Flink no records processed
                description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes."
-                query: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
+                query: "delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
                severity: warning
                for: 5m
                comments: |
@ -3125,6 +3125,7 @@ groups:
                comments: |
                  When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
                  so this alert may not fire. Prefer application-level availability metrics if available.
+                  Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.

              # Alert rule for ResourceManager availability
              - name: Hadoop Resource Manager Down
@ -3135,6 +3136,7 @@ groups:
                comments: |
                  When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
                  so this alert may not fire. Prefer application-level availability metrics if available.
+                  Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.

              # Alert rule for DataNode status
              - name: Hadoop Data Node Out Of Service
@ -3269,7 +3271,7 @@ groups:
                severity: critical
              - name: Kubernetes PersistentVolume error
                description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
-                query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
+                query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
                severity: critical
              - name: Kubernetes StatefulSet down
                description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
@ -3510,6 +3512,8 @@ groups:
                query: 'up{job=~".*openstack.*"} == 0'
                severity: critical
                for: 2m
+                comments: |
+                  Adjust the job label regex to match the actual job name in your Prometheus scrape config.
              - name: OpenStack Nova agent down
                description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}"
                query: 'openstack_nova_agent_state{adminState="enabled"} == 0'
@ -3804,7 +3808,7 @@ groups:
                for: 5m
              - name: GitLab CI pipeline failures increasing
                description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)."
-                query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05"
+                query: "deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05"
                severity: warning
                for: 10m
                comments: |
@ -3903,7 +3907,6 @@ groups:
                comments: |
                  ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
                  concurrency limits. This directly impacts users trying to push, pull, or clone.
-                  This alert is derived from the GitLab Omnibus default rules.
              - name: GitLab Gitaly high RPC latency
                description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)."
                query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1'
@ -4238,7 +4241,8 @@ groups:
                query: 'up{job=~"snmp.*"} == 0'
                severity: critical
                for: 5m
-                comments: From the official snmp-mixin.
+                comments: |
+                  Rename job=~"snmp.*" to match the actual job name in your Prometheus scrape config.
              - name: SNMP interface down
                description: "Interface {{ $labels.ifDescr }} on {{ $labels.instance }} is operationally down while administratively up."
                query: '(ifOperStatus{job=~"snmp.*"} == 2) and on(instance, job, ifIndex) (ifAdminStatus{job=~"snmp.*"} == 1)'
@ -4378,7 +4382,7 @@ groups:
                for: 5m
              - name: Cilium agent policy implementation delay
                description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies."
-                query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay[5m])) by (le, pod)) > 60"
+                query: "histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60"
                severity: warning
                for: 5m
                comments: Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
@ -4510,8 +4514,7 @@ groups:
                for: 1m
                comments: |
                  ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
-                  The official Ceph mixin splits this into separate warning (==1) and critical (==2) alerts.
-                  This rule fires on any non-OK state. Adjust severity or split as needed.
+                  This rule fires on any non-OK state. Split into separate warning/critical rules by using ==1 and ==2 thresholds if needed.
              - name: Ceph monitor clock skew
                description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
                query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
@ -4541,7 +4544,7 @@ groups:
                for: 5m
                comments: |
                  Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
-                  The official mixin uses ceph_health_detail for OSD space alerts.
+                  ceph_health_detail can also be used for more granular OSD space alerts.
              - name: Ceph OSD reweighted
                description: Ceph Object Storage Daemon takes too much time to resize.
                query: "ceph_osd_weight < 1"
@ -4874,12 +4877,12 @@ groups:
                for: 5m
              - name: Thanos Compactor Halted
                description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
-                query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
+                query: 'thanos_compact_halted == 1'
                severity: warning
                for: 5m
              - name: Thanos Compactor High Compaction Failures
                description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
-                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total[5m])) * 100 > 5) and sum by (job) (rate(thanos_compact_group_compactions_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Thanos Compact Bucket High Operation Failures
@ -4919,12 +4922,12 @@ groups:
                  Filters to actual error codes only. grpc_code!="OK" would include benign codes like NotFound, AlreadyExists, and Cancelled.
              - name: Thanos Query High D N S Failures
                description: "Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints."
-                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m]))) * 100 > 1 and sum by (job) (rate(thanos_query_store_apis_dns_lookups_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Thanos Query Instant Latency High
                description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
-                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
                severity: critical
                for: 10m
              - name: Thanos Query Range Latency High
@ -4952,22 +4955,22 @@ groups:
                for: 10m
              - name: Thanos Receive High Replication Failures
                description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
-                query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
+                query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error"}[5m])) / sum by (job) (rate(thanos_receive_replications_total[5m]))) > (max by (job) (floor((thanos_receive_replication_factor+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes))) * 100'
                severity: warning
                for: 5m
              - name: Thanos Receive High Forward Request Failures
                description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
-                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error"}[5m]))/  sum by (job) (rate(thanos_receive_forward_requests_total[5m]))) * 100 > 20 and sum by (job) (rate(thanos_receive_forward_requests_total[5m])) > 0'
                severity: info
                for: 5m
              - name: Thanos Receive High Hashring File Refresh Failures
                description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
-                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0) and sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Thanos Receive Config Reload Failure
                description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
-                query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
+                query: 'avg by (job) (thanos_receive_config_last_reload_successful) != 1'
                severity: warning
                for: 5m
              - name: Thanos Receive No Upload
@ -4987,7 +4990,7 @@ groups:
                for: 5m
              - name: Thanos Sidecar No Connection To Started Prometheus
                description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
-                query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
+                query: 'thanos_sidecar_prometheus_up == 0 and on (namespace, pod) prometheus_tsdb_data_replay_duration_seconds != 0'
                severity: critical
                for: 5m
          - name: Thanos Store
@ -5000,7 +5003,7 @@ groups:
                for: 5m
              - name: Thanos Store Series Gate Latency High
                description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
-                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count[5m])) > 0)'
                severity: warning
                for: 10m
              - name: Thanos Store Bucket High Operation Failures
@ -5018,12 +5021,12 @@ groups:
            rules:
              - name: Thanos Rule Queue Is Dropping Alerts
                description: "Thanos Rule {{$labels.instance}} is failing to queue alerts ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total[5m])) > 0'
                severity: critical
                for: 5m
              - name: Thanos Rule Sender Is Failing Alerts
                description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total[5m])) > 0'
                severity: critical
                for: 5m
              - name: Thanos Rule High Rule Evaluation Failures
@ -5033,7 +5036,7 @@ groups:
                for: 5m
              - name: Thanos Rule High Rule Evaluation Warnings
                description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings ({{ $value | humanize }}/s)."
-                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0.05'
+                query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total[5m])) > 0.05'
                comments: |
                  Threshold of 0.05/s avoids firing on transient single-event spikes.
                severity: info
@ -5050,17 +5053,17 @@ groups:
                for: 5m
              - name: Thanos Rule Config Reload Failure
                description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
-                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
+                query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful) != 1'
                severity: info
                for: 5m
              - name: Thanos Rule Query High D N S Failures
                description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
-                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Thanos Rule Alertmanager High D N S Failures
                description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
-                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) > 0'
+                query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) * 100 > 1) and sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Thanos Rule No Evaluation For10 Intervals
@ -5070,7 +5073,7 @@ groups:
                for: 5m
              - name: Thanos No Rule Evaluations
                description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
-                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+                query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0  and sum by (job, instance) (thanos_rule_loaded_rules) > 0'
                severity: critical
                for: 5m
          - name: Thanos Bucket Replicate
@ -5078,12 +5081,12 @@ groups:
            rules:
              - name: Thanos Bucket Replicate Error Rate
                description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
-                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m])) > 0'
+                query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error"}[5m])) / on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total[5m]))) * 100 >= 10 and sum by (job) (rate(thanos_replicate_replication_runs_total[5m])) > 0'
                severity: critical
                for: 5m
              - name: Thanos Bucket Replicate Run Latency
                description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
-                query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
+                query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket[5m]))) > 20 and  sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count[5m])) > 0)'
                severity: critical
                for: 5m
          - name: Thanos Component Absent
@ -5270,7 +5273,7 @@ groups:
                severity: critical
                for: 24h
                comments: |
-                  Official Tempo mixin normalizes by backend-worker count. Adjust threshold based on your compactor configuration.
+                  Threshold of 100 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
              - name: Tempo distributor usage tracker errors
                description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }})."
                query: sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05
@ -5285,7 +5288,7 @@ groups:
                for: 15m
              - name: Tempo metrics generator service graphs dropping spans
                description: Tempo metrics generator is dropping {{ printf "%.2f" $value }}% of spans in service graphs for {{ $labels.job }}.
-                query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
+                query: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
                severity: warning
                for: 15m
              - name: Tempo metrics generator collections failing
@ -5453,7 +5456,7 @@ groups:
                description: Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.
                query: (time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0
                comments: |
-                  Threshold aligned with official Mimir mixin (30 minutes).
+                  Threshold of 30 minutes. Adjust based on your sync interval.
                severity: critical
                for: 5m
              - name: Mimir store gateway no synced tenants
@ -5495,7 +5498,7 @@ groups:
                description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }})."
                query: increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0
                comments: |
-                  Using 24h window per official mixin — compaction skips are rare events.
+                  Using a 24h window as compaction skips are rare events.
                severity: warning
                for: 5m
              # Ruler
@ -5616,6 +5619,8 @@ groups:
                query: 'up{job=~".*otel.*collector.*"} == 0'
                severity: critical
                for: 1m
+                comments: |
+                  Adjust the job label regex to match the actual job name in your Prometheus scrape config.
              - name: OpenTelemetry Collector receiver refused spans
                description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}."
                query: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
@ -5680,7 +5685,7 @@ groups:
                for: 5m
              - name: OpenTelemetry Collector high memory usage
                description: "OpenTelemetry Collector memory usage is above 90%"
-                query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
+                query: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
                severity: warning
                for: 5m
              - name: OpenTelemetry Collector OTLP receiver errors