Publish

2026-06-21 00:47:18 +08:00 · 2026-03-16 13:37:19 +00:00 · 2026-03-16 13:37:19 +00:00 · 7f346ede99
commit 7f346ede99
parent b58b498bbb
2 changed files with 622 additions and 0 deletions
--- a/dist/rules/grafana-mimir/embedded-exporter.yml
+++ b/dist/rules/grafana-mimir/embedded-exporter.yml
@ -0,0 +1,449 @@
+groups:
+
+- name: EmbeddedExporter
+
+  # Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected.
+  
+  rules:
+
+    # Fires when the lowest Unhealthy-ingester count reported by any series of a
+    # job stays above zero for 15 minutes.
+    # The expression aggregates by job, so the summary references `job`; the
+    # `instance` label is dropped by the aggregation and would render empty.
+    - alert: MimirIngesterUnhealthy
+      expr: 'min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester unhealthy (job {{ $labels.job }})
+        description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Percentage of requests answered with a 5xx status code, per job and route,
+    # ignoring the `ready` and `debug_pprof` routes. Fires above 1% for 15 minutes.
+    # Only `job` and `route` survive the aggregation, so the summary uses those
+    # instead of the (dropped) `instance` label.
+    - alert: MimirRequestErrors
+      expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir request errors (job {{ $labels.job }}, route {{ $labels.route }})
+        description: "Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Counts the distinct `sha256` values of cortex_runtime_config_hash per job.
+    # More than one distinct hash for 1 hour means instances disagree.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirInconsistentRuntimeConfig
+      expr: 'count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1'
+      for: 1h
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir inconsistent runtime config (job {{ $labels.job }})
+        description: "An inconsistent runtime config file is used across Mimir instances.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Keeps only series whose last runtime-config reload flag is 0 and sums them
+    # per job; any result for 5 minutes fires the alert.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirBadRuntimeConfig
+      expr: 'sum by (job) (cortex_runtime_config_last_reload_successful == 0) > 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir bad runtime config (job {{ $labels.job }})
+        description: "{{ $labels.job }} failed to reload runtime config.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Sums, per job, the 1-minute minimum of each scheduler queue length. A value
+    # above zero means at least one queue was never empty during that minute.
+    # Fires after 7 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirSchedulerQueriesStuck
+      expr: 'sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0'
+      for: 7m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir scheduler queries stuck (job {{ $labels.job }})
+        description: "There are {{ $value }} queued up queries in {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Failed cache operations as a percentage of all cache operations, grouped by
+    # cache name, operation and job. Fires above 5% for 5 minutes.
+    # `instance` is aggregated away, so the summary references `job` instead.
+    - alert: MimirCacheRequestErrors
+      expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir cache request errors (job {{ $labels.job }})
+        description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Ratio of non-2xx KV store requests to all KV store requests per job and
+    # kv_name. Fires only when the ratio is exactly 1 (every request failing)
+    # for 5 minutes.
+    # `instance` is aggregated away, so the summary references `job` instead.
+    - alert: MimirKvStoreFailure
+      expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir KV store failure (job {{ $labels.job }})
+        description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Memory-map areas in use as a percentage of the per-process limit, for jobs
+    # whose name contains ingester, cortex, mimir or store-gateway.
+    # Fires above 80% for 5 minutes.
+    - alert: MimirMemoryMapAreasTooHigh
+      expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir memory map areas too high (instance {{ $labels.instance }})
+        description: "Mimir {{ $labels.job }} is using {{ printf \"%.0f\" $value }}% of its memory map area limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when an ingester that had tenants within the last day has reported
+    # zero tenants for 1 hour.
+    # The "had tenants before" guard must look back further than `for`. A guard
+    # of `offset 1h > 0` turns false exactly one hour after the value drops to
+    # zero, which is the moment `for: 1h` would elapse, so the alert could never
+    # fire. With a 1d window the alert resolves on its own once the instance
+    # has been empty for a full day.
+    - alert: MimirIngesterInstanceHasNoTenants
+      expr: '(cortex_ingester_memory_users == 0) and on (instance) (max_over_time(cortex_ingester_memory_users[1d]) > 0)'
+      for: 1h
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ingester instance has no tenants (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} has no tenants assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when a ruler that had rule-group managers within the last day has
+    # reported zero managers for 1 hour.
+    # The "had managers before" guard must look back further than `for`. A guard
+    # of `offset 1h > 0` turns false exactly one hour after the value drops to
+    # zero, which is the moment `for: 1h` would elapse, so the alert could never
+    # fire. With a 1d window the alert resolves on its own once the instance
+    # has been empty for a full day.
+    - alert: MimirRulerInstanceHasNoRuleGroups
+      expr: '(cortex_ruler_managers_total == 0) and on (instance) (max_over_time(cortex_ruler_managers_total[1d]) > 0)'
+      for: 1h
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ruler instance has no rule groups (instance {{ $labels.instance }})
+        description: "Mimir ruler {{ $labels.instance }} has no rule groups assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Largest difference, per job, between the TSDB head max timestamp and the
+    # evaluation time, skipping heads whose max timestamp is 0.
+    # Fires when that difference exceeds 3600 seconds for 5 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirIngestedDataTooFarInTheFuture
+      expr: 'max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ingested data too far in the future (job {{ $labels.job }})
+        description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-job rate of failed object-store bucket operations issued by the
+    # store-gateway. Fires when it stays above zero for 5 minutes.
+    # The `component="store-gateway"` selector is required: without it, bucket
+    # failures from any other component exposing this metric would be reported
+    # as store-gateway failures.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirStoreGatewayTooManyFailedOperations
+      expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[5m])) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir store gateway too many failed operations (job {{ $labels.job }})
+        description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Sums cortex_ring_members per instance, then compares the largest and the
+    # smallest per-instance total within each ring name and job. Fires when
+    # they differ for 15 minutes.
+    # `instance` is aggregated away by the outer max/min, so the summary
+    # references `job` instead.
+    - alert: MimirRingMembersMismatch
+      expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ring members mismatch (job {{ $labels.job }})
+        description: "Mimir {{ $labels.name }} ring has inconsistent member counts across instances.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # In-memory series as a percentage of the per-instance `max_series` limit.
+    # Fires above 80% for 3 hours; instances with a limit of 0 are skipped.
+    # `and ignoring(limit)` is required: the left-hand ratio has no `limit`
+    # label while the right-hand selector carries limit="max_series", so a plain
+    # `and` never finds a matching label set and the alert could never fire.
+    - alert: MimirIngesterReachingSeriesLimitWarning
+      expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
+      for: 3h
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ingester reaching series limit warning (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # In-memory series as a percentage of the per-instance `max_series` limit.
+    # Fires above 90% for 5 minutes; instances with a limit of 0 are skipped.
+    # `and ignoring(limit)` is required: the left-hand ratio has no `limit`
+    # label while the right-hand selector carries limit="max_series", so a plain
+    # `and` never finds a matching label set and the alert could never fire.
+    - alert: MimirIngesterReachingSeriesLimitCritical
+      expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester reaching series limit critical (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # In-memory tenants as a percentage of the per-instance `max_tenants` limit.
+    # Fires above 70% for 5 minutes; instances with a limit of 0 are skipped.
+    # `and ignoring(limit)` is required: the left-hand ratio has no `limit`
+    # label while the right-hand selector carries limit="max_tenants", so a
+    # plain `and` never finds a matching label set and the alert could never fire.
+    - alert: MimirIngesterReachingTenantsLimitWarning
+      expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ingester reaching tenants limit warning (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # In-memory tenants as a percentage of the per-instance `max_tenants` limit.
+    # Fires above 80% for 5 minutes; instances with a limit of 0 are skipped.
+    # `and ignoring(limit)` is required: the left-hand ratio has no `limit`
+    # label while the right-hand selector carries limit="max_tenants", so a
+    # plain `and` never finds a matching label set and the alert could never fire.
+    - alert: MimirIngesterReachingTenantsLimitCritical
+      expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester reaching tenants limit critical (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Open TCP connections as a percentage of cortex_tcp_connections_limit.
+    # Fires above 80% for 5 minutes. The trailing `and ... > 0` drops series
+    # whose limit is 0.
+    - alert: MimirReachingTcpConnectionsLimit
+      expr: 'cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir reaching TCP connections limit (instance {{ $labels.instance }})
+        description: "Mimir instance {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its TCP connections limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # In-flight push requests as a percentage of the per-instance
+    # `max_inflight_push_requests` limit. Fires above 80% for 5 minutes;
+    # instances with a limit of 0 are skipped.
+    # `and ignoring(limit)` is required: the left-hand ratio has no `limit`
+    # label while the right-hand selector carries one, so a plain `and` never
+    # finds a matching label set and the alert could never fire.
+    - alert: MimirDistributorInflightRequestsHigh
+      expr: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and ignoring(limit) (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0)'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
+        description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed TSDB compactions. Fires when it stays
+    # above zero for 15 minutes.
+    - alert: MimirIngesterTsdbHeadCompactionFailed
+      expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed TSDB head truncations. `for: 0m` makes
+    # the alert fire on the first evaluation where the rate is above zero.
+    - alert: MimirIngesterTsdbHeadTruncationFailed
+      expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed TSDB checkpoint creations. `for: 0m`
+    # makes the alert fire on the first evaluation where the rate is above zero.
+    - alert: MimirIngesterTsdbCheckpointCreationFailed
+      expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed TSDB checkpoint deletions. `for: 0m`
+    # makes the alert fire on the first evaluation where the rate is above zero.
+    - alert: MimirIngesterTsdbCheckpointDeletionFailed
+      expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed TSDB WAL truncations. `for: 0m` makes
+    # the alert fire on the first evaluation where the rate is above zero.
+    # Unlike the sibling TSDB failure alerts, this one is a warning.
+    - alert: MimirIngesterTsdbWalTruncationFailed
+      expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 1-minute rate of failed TSDB WAL writes. Fires when it stays
+    # above zero for 3 minutes.
+    # NOTE(review): rate() needs at least two samples inside the 1m range, so
+    # this only returns data when the scrape interval is well below one minute.
+    # Confirm against the deployment's scrape interval.
+    - alert: MimirIngesterTsdbWalWritesFailed
+      expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0'
+      for: 3m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
+        description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Seconds since the store-gateway's last successful block sync. Fires when
+    # that exceeds 600 seconds for 5 minutes. The `and ... > 0` clause drops
+    # series whose timestamp is still 0.
+    - alert: MimirStoreGatewayHasNotSyncedBucket
+      expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 600) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
+        description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when a store-gateway that had synced tenants within the last day
+    # has reported zero synced tenants for 1 hour.
+    # The "had tenants before" guard must look back further than `for`. A guard
+    # of `offset 1h > 0` turns false exactly one hour after the value drops to
+    # zero, which is the moment `for: 1h` would elapse, so the alert could never
+    # fire. With a 1d window the alert resolves on its own once the instance
+    # has been empty for a full day.
+    - alert: MimirStoreGatewayNoSyncedTenants
+      expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (max_over_time(cortex_bucket_stores_tenants_synced{component="store-gateway"}[1d]) > 0)'
+      for: 1h
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir store gateway no synced tenants (instance {{ $labels.instance }})
+        description: "Mimir store-gateway {{ $labels.instance }} has no synced tenants.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Smallest age, per tenant (`user`) and job, of the last successful bucket
+    # index update. Fires as soon as it exceeds 2100 seconds (35 minutes).
+    # Only `user` and `job` survive the aggregation, so the summary names the
+    # tenant instead of the dropped `instance` label.
+    - alert: MimirBucketIndexNotUpdated
+      expr: 'min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir bucket index not updated (tenant {{ $labels.user }})
+        description: "Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Seconds since the compactor's last successful block cleanup. Fires when
+    # that exceeds 21600 seconds (6 hours) for 1 hour. The `and ... > 0` clause
+    # drops series whose timestamp is still 0.
+    - alert: MimirCompactorNotCleaningUpBlocks
+      expr: '(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0'
+      for: 1h
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir compactor not cleaning up blocks (instance {{ $labels.instance }})
+        description: "Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Seconds since the compactor's last successful run. Fires when that
+    # exceeds 86400 seconds (24 hours) for 15 minutes. The `and ... > 0` clause
+    # drops series whose timestamp is still 0.
+    - alert: MimirCompactorNotRunningCompaction
+      expr: '(time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir compactor not running compaction (instance {{ $labels.instance }})
+        description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Increase of the failed compactor runs counter over 2 hours. Fires
+    # immediately (`for: 0m`) when the increase is greater than 1.
+    - alert: MimirCompactorHasConsecutiveFailures
+      expr: 'increase(cortex_compactor_runs_failed_total[2h]) > 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
+        description: "Mimir compactor {{ $labels.instance }} has had 2+ compaction failures in the last 2 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Increase of the compactor out-of-disk-space error counter over 24 hours.
+    # Fires immediately (`for: 0m`) when the increase is at least 1, so the
+    # alert stays active for up to a day after a single error.
+    - alert: MimirCompactorHasRunOutOfDiskSpace
+      expr: 'increase(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir compactor has run out of disk space (instance {{ $labels.instance }})
+        description: "Mimir compactor {{ $labels.instance }} has run out of disk space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Seconds since the compactor's last successful object-store upload. Fires
+    # when that exceeds 86400 seconds (24 hours) for 15 minutes. The
+    # `and ... > 0` clause drops series whose timestamp is still 0.
+    - alert: MimirCompactorHasNotUploadedBlocks
+      expr: '(time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
+        description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Increase over 5 minutes of the counter of blocks marked for no
+    # compaction. Fires when it stays above zero for 5 minutes. The description
+    # reads the `reason` label from the series.
+    - alert: MimirCompactorSkippedBlocks
+      expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
+        description: "Mimir compactor has found blocks that cannot be compacted (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Failed ruler write requests as a percentage of all ruler write requests,
+    # per instance and job. Fires above 1% for 5 minutes.
+    - alert: MimirRulerTooManyFailedPushes
+      expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ruler too many failed pushes (instance {{ $labels.instance }})
+        description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Failed ruler queries as a percentage of all ruler queries, per instance
+    # and job. Fires above 1% for 5 minutes.
+    - alert: MimirRulerTooManyFailedQueries
+      expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ruler too many failed queries (instance {{ $labels.instance }})
+        description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query evaluations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Missed rule-group iterations as a percentage of all rule-group
+    # iterations, per instance and job. Fires above 1% for 5 minutes.
+    - alert: MimirRulerMissedEvaluations
+      expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
+        description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-job 5-minute rate of ruler ring check errors. Fires when it stays
+    # above zero for 5 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirRulerFailedRingCheck
+      expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir ruler failed ring check (job {{ $labels.job }})
+        description: "Mimir ruler {{ $labels.job }} is failing ring checks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed alertmanager config syncs. Fires when
+    # it stays above zero for 30 minutes.
+    - alert: MimirAlertmanagerSyncConfigsFailing
+      expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0'
+      for: 30m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of alertmanager ring check errors. Fires when it
+    # stays above zero for 10 minutes.
+    - alert: MimirAlertmanagerRingCheckFailing
+      expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} is failing ring checks.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed partial state merges. Fires when it
+    # stays above zero for 10 minutes.
+    - alert: MimirAlertmanagerStateMergeFailing
+      expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 5-minute rate of failed state replications. Fires when it
+    # stays above zero for 10 minutes.
+    - alert: MimirAlertmanagerReplicationFailing
+      expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Per-series 15-minute rate of failed state persists. Fires when it stays
+    # above zero for 1 hour.
+    - alert: MimirAlertmanagerPersistStateFailing
+      expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0'
+      for: 1h
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} is failing to persist state.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires immediately when the counter of initial state syncs with
+    # outcome="failed" increased during the last 5 minutes.
+    # The range is 5m rather than 1m because increase() needs at least two
+    # samples inside the range. A 1m window returns nothing unless the scrape
+    # interval is well below one minute, and with `for: 0m` a missed window
+    # means the failure is never reported.
+    - alert: MimirAlertmanagerInitialSyncFailed
+      expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[5m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir alertmanager initial sync failed (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.job }} failed initial state sync.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when an alertmanager that owned tenants within the last day has
+    # reported zero owned tenants for 1 hour.
+    # The "had tenants before" guard must look back further than `for`. A guard
+    # of `offset 1h > 0` turns false exactly one hour after the value drops to
+    # zero, which is the moment `for: 1h` would elapse, so the alert could never
+    # fire. With a 1d window the alert resolves on its own once the instance
+    # has been empty for a full day.
+    - alert: MimirAlertmanagerInstanceHasNoTenants
+      expr: '(cortex_alertmanager_tenants_owned == 0) and on (instance) (max_over_time(cortex_alertmanager_tenants_owned[1d]) > 0)'
+      for: 1h
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir alertmanager instance has no tenants (instance {{ $labels.instance }})
+        description: "Mimir alertmanager {{ $labels.instance }} has no tenants assigned.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Compares, per job, the largest memberlist member count seen by any
+    # instance with the job average scaled by 1.15 plus 10. Fires when the
+    # maximum exceeds that bound for 20 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirGossipMembersCountTooHigh
+      expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
+      for: 20m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir gossip members count too high (job {{ $labels.job }})
+        description: "Mimir gossip cluster has more members than expected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Compares, per job, the smallest memberlist member count seen by any
+    # instance with half of the job average. Fires when the minimum stays
+    # below that bound for 20 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: MimirGossipMembersCountTooLow
+      expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
+      for: 20m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir gossip members count too low (job {{ $labels.job }})
+        description: "Mimir gossip cluster has fewer members than expected.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # go_threads counts OS threads created by the Go runtime, not goroutines.
+    # A sustained high value usually means many goroutines are blocked in system
+    # calls or cgo calls. The Go runtime aborts the process once it exceeds its
+    # thread limit (10000 by default), so both thresholds sit below that.
+    # Fires for jobs whose name contains mimir or cortex when an instance stays
+    # above 5000 threads for 15 minutes.
+    - alert: MimirGoThreadsTooHighWarning
+      expr: 'go_threads{job=~".*(mimir|cortex).*"} > 5000'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Mimir go threads too high warning (instance {{ $labels.instance }})
+        description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Critical tier of the Go thread count alert: fires for jobs whose name
+    # contains mimir or cortex when an instance stays above 8000 threads for
+    # 15 minutes.
+    - alert: MimirGoThreadsTooHighCritical
+      expr: 'go_threads{job=~".*(mimir|cortex).*"} > 8000'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Mimir go threads too high critical (instance {{ $labels.instance }})
+        description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/grafana-tempo/embedded-exporter.yml
+++ b/dist/rules/grafana-tempo/embedded-exporter.yml
@ -0,0 +1,173 @@
+groups:
+
+- name: EmbeddedExporter
+
+  
+  rules:
+
+    # Largest Unhealthy-distributor count reported by any series of a job.
+    # Fires when it stays above zero for 15 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoDistributorUnhealthy
+      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Tempo distributor unhealthy (job {{ $labels.job }})
+        description: "Tempo has {{ $value }} unhealthy distributor(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Largest Unhealthy live-store count reported by any series of a job.
+    # Fires when it stays above zero for 15 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoLiveStoreUnhealthy
+      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo live store unhealthy (job {{ $labels.job }})
+        description: "Tempo has {{ $value }} unhealthy live store(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Largest Unhealthy metrics-generator count reported by any series of a
+    # job. Fires when it stays above zero for 15 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoMetricsGeneratorUnhealthy
+      expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo metrics generator unhealthy (job {{ $labels.job }})
+        description: "Tempo has {{ $value }} unhealthy metrics generator(s).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Two-window check per job: more than 2 compaction errors over the last
+    # hour AND at least one in the last 5 minutes, so the alert only stays
+    # active while errors are still occurring. Must hold for 1 hour to fire.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoCompactionsFailing
+      expr: 'sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0'
+      for: 1h
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo compactions failing (job {{ $labels.job }})
+        description: "Greater than 2 compactions have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Two-window check per job: more than 2 blocklist poll errors over the
+    # last hour AND at least one in the last 5 minutes. Fires immediately.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoPollsFailing
+      expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo polls failing (job {{ $labels.job }})
+        description: "Greater than 2 blocklist polls have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Two-window check per job: more than 2 tenant index errors over the last
+    # hour AND at least one in the last 5 minutes. Fires immediately.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoTenantIndexFailures
+      expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo tenant index failures (job {{ $labels.job }})
+        description: "Greater than 2 tenant index failures in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when the tenant index builder count summed per tenant is zero for
+    # 5 minutes. `and on()` joins against a single label-less value, so the
+    # rule is only active while the largest blocklist length across all series
+    # is above zero.
+    # The result only carries `tenant`, so the summary names the tenant instead
+    # of the dropped `instance` label.
+    - alert: TempoNoTenantIndexBuilders
+      expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo no tenant index builders (tenant {{ $labels.tenant }})
+        description: "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Largest tenant index age per tenant. Threshold of 600s (10 minutes), held
+    # for 5 minutes; adjust based on your tenant index build interval.
+    # The result only carries `tenant`, so the summary names the tenant instead
+    # of the dropped `instance` label.
+    - alert: TempoTenantIndexTooOld
+      expr: 'max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo tenant index too old (tenant {{ $labels.tenant }})
+        description: "Tenant index for {{ $labels.tenant }} is {{ $value }}s old.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when the average blocklist length is more than 40% above its value
+    # 7 days earlier for 15 minutes.
+    # Both sides are global averages without grouping, so the result carries no
+    # labels at all; the summary therefore has no label placeholder.
+    - alert: TempoBlockListRisingQuickly
+      expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo block list rising quickly
+        description: "Tempo blocklist length is up {{ printf \"%.0f\" $value }}% over the last 7 days. Consider scaling compactors.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Keeps only series whose last runtime-config reload flag is 0 and sums
+    # them per job; any result for 15 minutes fires the alert.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoBadOverrides
+      expr: 'sum by (job) (tempo_runtime_config_last_reload_successful == 0) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo bad overrides (job {{ $labels.job }})
+        description: "{{ $labels.job }} failed to reload runtime overrides.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Two-window check per job: more than 5 failed user-configurable overrides
+    # reloads over the last hour AND at least one in the last 5 minutes.
+    # Fires immediately.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoUserConfigurableOverridesReloadFailing
+      expr: 'sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0'
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo user configurable overrides reload failing (job {{ $labels.job }})
+        description: "Greater than 5 user-configurable overrides reloads have failed in the past hour.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Outstanding compaction blocks summed per instance. Threshold of 100
+    # blocks per compactor instance, held for 6 hours. Adjust based on your
+    # environment.
+    - alert: TempoCompactionTooManyOutstandingBlocksWarning
+      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 100'
+      for: 6h
+      labels:
+        severity: warning
+      annotations:
+        summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
+        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Critical tier of the outstanding-blocks alert: more than 250 outstanding
+    # compaction blocks summed per instance, held for 24 hours.
+    - alert: TempoCompactionTooManyOutstandingBlocksCritical
+      expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
+      for: 24h
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
+        description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # 5-minute rate of distributor usage tracker errors per job and reason.
+    # Fires when it stays above zero for 30 minutes.
+    # Only `job` and `reason` survive the aggregation, so the summary
+    # references `job` rather than the dropped `instance` label.
+    - alert: TempoDistributorUsageTrackerErrors
+      expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0'
+      for: 30m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo distributor usage tracker errors (job {{ $labels.job }})
+        description: "Tempo distributor usage tracker errors for {{ $labels.job }} (reason {{ $labels.reason }}).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Increase over 5 minutes, per job, of failed active-processor updates in
+    # the metrics generator. Fires when it stays above zero for 15 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoMetricsGeneratorProcessorUpdatesFailing
+      expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 0'
+      for: 15m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo metrics generator processor updates failing (job {{ $labels.job }})
+        description: "Tempo metrics generator processor updates are failing for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Spans dropped by the service-graphs processor as a percentage of spans
+    # received by the metrics generator, per job. Fires above 0.5% for
+    # 15 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
+      expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Tempo metrics generator service graphs dropping spans (job {{ $labels.job }})
+        description: "Tempo metrics generator is dropping {{ printf \"%.2f\" $value }}% of spans in service graphs for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Increase over 5 minutes, per job, of failed registry collections in the
+    # metrics generator. Fires when it stays above 2 for 5 minutes.
+    # The result only carries `job`, so the summary references it rather than
+    # the dropped `instance` label.
+    - alert: TempoMetricsGeneratorCollectionsFailing
+      expr: 'sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Tempo metrics generator collections failing (job {{ $labels.job }})
+        description: "Tempo metrics generator collections are failing for {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Memcached requests with status_code="500" as a percentage of all
+    # memcached requests, per cache name and job. Fires above 20% for
+    # 10 minutes. Only relevant if Tempo is configured with memcached caching.
+    # Only `name` and `job` survive the aggregation, so the summary references
+    # `job` rather than the dropped `instance` label.
+    - alert: TempoMemcachedErrorsElevated
+      expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Tempo memcached errors elevated (job {{ $labels.job }})
+        description: "Tempo memcached error rate is {{ printf \"%.2f\" $value }}% for {{ $labels.name }} in {{ $labels.job }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"