From 03963ef6f94ab706f83a1c046caea74fe4df0560 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Tue, 17 Mar 2026 13:30:13 +0100 Subject: [PATCH] refactor(categories): change categories and move some exporters (#528) --- README.md | 52 +- _data/rules.yml | 1586 ++++++++++++++++++++++++----------------------- 2 files changed, 833 insertions(+), 805 deletions(-) diff --git a/README.md b/README.md index 337cc9b..a0ca983 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter) - [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd) -#### Databases and brokers +#### Databases - [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql) - [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql) @@ -65,20 +65,22 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis) - [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached) - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb) -- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) - [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) - [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse) - [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb) +- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr) + +#### Message brokers + +- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - 
[Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka) - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar) - [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats) -- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr) -- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop) -#### Reverse proxies and load balancers +#### Proxies, load balancers and service meshes - [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx) - [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache) @@ -86,6 +88,8 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik) - [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy) - [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy) +- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd) +- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio) #### Runtimes @@ -95,27 +99,32 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby) - [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python) - [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq) + +#### Data engineering + - [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink) - [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark) +- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop) #### Orchestrators + - [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes) - [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad) - 
[Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul) - [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd) -- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd) -- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio) +- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack) + +#### CI/CD + +- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd) - [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd) -- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack) +- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci) - [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker) -#### Network, security and storage +#### Network and security -- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph) -- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs) -- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs) -- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio) +- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest) - [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls) - [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager) - [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper) @@ -128,6 +137,13 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium) - [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard) +#### Storage + +- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph) +- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs) +- 
[OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs) +- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio) + #### Cloud providers - [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch) @@ -135,7 +151,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean) - [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure) -#### Other +#### Observability - [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos) - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki) @@ -145,11 +161,13 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir) - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) - [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector) -- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) -- [GitLab](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab) -- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) - [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger) +#### Other + +- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups) +- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) + ## 🤝 Contributing Contributions from community (you!) are most welcome! diff --git a/_data/rules.yml b/_data/rules.yml index 16b0ad2..7ddb8d3 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -856,7 +856,7 @@ groups: comments: | Triggers if timer hasn't fired in 24 hours. Adjust threshold per timer schedule. 
- - name: Databases and brokers + - name: Databases services: - name: MySQL exporters: @@ -1385,141 +1385,6 @@ groups: query: 'changes(mgob_scheduler_backup_total{status="500"}[1h]) > 0' severity: critical - - name: RabbitMQ - exporters: - - name: rabbitmq/rabbitmq-prometheus - slug: rabbitmq-exporter - doc_url: https://github.com/rabbitmq/rabbitmq-prometheus - rules: - - name: RabbitMQ node down - description: Less than 3 nodes running in RabbitMQ cluster - query: "sum(rabbitmq_build_info) < 3" - severity: critical - for: 1m - comments: | - 1m delay allows a restart without triggering an alert. - - name: RabbitMQ node not distributed - description: Distribution link state is not 'up' - query: "erlang_vm_dist_node_state < 3" - severity: critical - for: 1m - comments: | - 1m delay allows a restart without triggering an alert. - - name: RabbitMQ instances different versions - description: Running different version of RabbitMQ in the same cluster, can lead to failure. - query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1" - severity: warning - for: 1h - - name: RabbitMQ memory high - description: A node use more than 90% of allocated RAM - query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0" - severity: warning - for: 2m - - name: RabbitMQ file descriptors usage - description: A node use more than 90% of file descriptors - query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0" - severity: warning - for: 2m - - name: RabbitMQ too many ready messages - description: RabbitMQ too many ready messages on {{ $labels.instance }} - query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" - severity: warning - for: 1m - - name: RabbitMQ too many unack messages - description: Too many unacknowledged messages - query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000" - severity: warning - for: 1m - - name: 
RabbitMQ too many connections - description: The total connections of a node is too high - query: "rabbitmq_connections > 1000" - severity: warning - for: 2m - - name: RabbitMQ no queue consumer - description: A queue has less than 1 consumer - query: "rabbitmq_queue_consumers < 1" - severity: warning - for: 1m # allows a short service restart - - name: RabbitMQ unroutable messages - description: A queue has unroutable messages - query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0" - severity: warning - for: 2m - - - name: kbudde/rabbitmq-exporter - slug: kbudde-rabbitmq-exporter - doc_url: https://github.com/kbudde/rabbitmq_exporter - rules: - - name: RabbitMQ down - description: RabbitMQ node down - query: "rabbitmq_up == 0" - severity: critical - for: 1m - comments: | - 1m delay allows a restart without triggering an alert. - - name: RabbitMQ cluster down - description: Less than 3 nodes running in RabbitMQ cluster - query: "sum(rabbitmq_running) < 3" - severity: critical - for: 1m - comments: | - 1m delay allows a restart without triggering an alert. - - name: RabbitMQ cluster partition - description: Cluster partition - query: "rabbitmq_partitions > 0" - severity: critical - - name: RabbitMQ out of memory - description: Memory available for RabbitMQ is low (< 10%) - query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0" - severity: warning - for: 2m - - name: RabbitMQ too many connections - description: RabbitMQ instance has too many connections (> 1000) - query: "rabbitmq_connectionsTotal > 1000" - severity: warning - for: 2m - - name: RabbitMQ dead letter queue filling up - description: Dead letter queue is filling up (> 10 msgs) - query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' - severity: warning - for: 1m - comments: | - Indicate the queue name in dedicated label. 
- - name: RabbitMQ too many messages in queue - description: Queue is filling up (> 1000 msgs) - query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' - severity: warning - for: 2m - comments: | - Indicate the queue name in dedicated label. - - name: RabbitMQ slow queue consuming - description: Queue messages are consumed slowly (> 60s) - query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' - severity: warning - for: 2m - comments: | - Indicate the queue name in dedicated label. - - name: RabbitMQ no consumer - description: Queue has no consumer - query: "rabbitmq_queue_consumers == 0" - severity: critical - for: 5m - comments: | - Allows a short service restart. - - name: RabbitMQ too many consumers - description: Queue should have only 1 consumer - query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' - severity: critical - comments: | - Indicate the queue name in dedicated label. - - name: RabbitMQ inactive exchange - description: Exchange receive less than 5 msgs per second - query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' - severity: warning - comments: | - Indicate the exchange name in dedicated label. - for: 2m - - name: Elasticsearch exporters: - name: prometheus-community/elasticsearch_exporter @@ -1999,6 +1864,167 @@ groups: severity: critical for: 1m + - name: Solr + exporters: + - name: embedded exporter + slug: embedded-exporter + doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html + rules: + - name: Solr update errors + description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}. + query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1" + severity: critical + - name: Solr query errors + description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}. 
+ query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1' + severity: warning + for: 5m + - name: Solr replication errors + description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}. + query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1' + severity: critical + - name: Solr low live node count + description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}. + query: "solr_collections_live_nodes < 2" + severity: critical + + - name: Message brokers + services: + - name: RabbitMQ + exporters: + - name: rabbitmq/rabbitmq-prometheus + slug: rabbitmq-exporter + doc_url: https://github.com/rabbitmq/rabbitmq-prometheus + rules: + - name: RabbitMQ node down + description: Less than 3 nodes running in RabbitMQ cluster + query: "sum(rabbitmq_build_info) < 3" + severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. + - name: RabbitMQ node not distributed + description: Distribution link state is not 'up' + query: "erlang_vm_dist_node_state < 3" + severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. + - name: RabbitMQ instances different versions + description: Running different version of RabbitMQ in the same cluster, can lead to failure. 
+ query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1" + severity: warning + for: 1h + - name: RabbitMQ memory high + description: A node use more than 90% of allocated RAM + query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 and rabbitmq_resident_memory_limit_bytes > 0" + severity: warning + for: 2m + - name: RabbitMQ file descriptors usage + description: A node use more than 90% of file descriptors + query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 and rabbitmq_process_max_fds > 0" + severity: warning + for: 2m + - name: RabbitMQ too many ready messages + description: RabbitMQ too many ready messages on {{ $labels.instance }} + query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" + severity: warning + for: 1m + - name: RabbitMQ too many unack messages + description: Too many unacknowledged messages + query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000" + severity: warning + for: 1m + - name: RabbitMQ too many connections + description: The total connections of a node is too high + query: "rabbitmq_connections > 1000" + severity: warning + for: 2m + - name: RabbitMQ no queue consumer + description: A queue has less than 1 consumer + query: "rabbitmq_queue_consumers < 1" + severity: warning + for: 1m # allows a short service restart + - name: RabbitMQ unroutable messages + description: A queue has unroutable messages + query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0" + severity: warning + for: 2m + + - name: kbudde/rabbitmq-exporter + slug: kbudde-rabbitmq-exporter + doc_url: https://github.com/kbudde/rabbitmq_exporter + rules: + - name: RabbitMQ down + description: RabbitMQ node down + query: "rabbitmq_up == 0" + severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. 
+ - name: RabbitMQ cluster down + description: Less than 3 nodes running in RabbitMQ cluster + query: "sum(rabbitmq_running) < 3" + severity: critical + for: 1m + comments: | + 1m delay allows a restart without triggering an alert. + - name: RabbitMQ cluster partition + description: Cluster partition + query: "rabbitmq_partitions > 0" + severity: critical + - name: RabbitMQ out of memory + description: Memory available for RabbitMQ is low (< 10%) + query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 and rabbitmq_node_mem_limit > 0" + severity: warning + for: 2m + - name: RabbitMQ too many connections + description: RabbitMQ instance has too many connections (> 1000) + query: "rabbitmq_connectionsTotal > 1000" + severity: warning + for: 2m + - name: RabbitMQ dead letter queue filling up + description: Dead letter queue is filling up (> 10 msgs) + query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' + severity: warning + for: 1m + comments: | + Indicate the queue name in dedicated label. + - name: RabbitMQ too many messages in queue + description: Queue is filling up (> 1000 msgs) + query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' + severity: warning + for: 2m + comments: | + Indicate the queue name in dedicated label. + - name: RabbitMQ slow queue consuming + description: Queue messages are consumed slowly (> 60s) + query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' + severity: warning + for: 2m + comments: | + Indicate the queue name in dedicated label. + - name: RabbitMQ no consumer + description: Queue has no consumer + query: "rabbitmq_queue_consumers == 0" + severity: critical + for: 5m + comments: | + Allows a short service restart. + - name: RabbitMQ too many consumers + description: Queue should have only 1 consumer + query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' + severity: critical + comments: | + Indicate the queue name in dedicated label. 
+ - name: RabbitMQ inactive exchange + description: Exchange receive less than 5 msgs per second + query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' + severity: warning + comments: | + Indicate the exchange name in dedicated label. + for: 2m + - name: Zookeeper exporters: - name: cloudflare/kafka_zookeeper_exporter @@ -2190,107 +2216,7 @@ groups: severity: warning for: 5m - - name: Solr - exporters: - - name: embedded exporter - slug: embedded-exporter - doc_url: https://solr.apache.org/guide/8_11/monitoring-solr-with-prometheus-and-grafana.html - rules: - - name: Solr update errors - description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}. - query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1" - severity: critical - - name: Solr query errors - description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}. - query: 'increase(solr_metrics_core_errors_total{category="QUERY"}[1m]) > 1' - severity: warning - for: 5m - - name: Solr replication errors - description: Solr collection {{ $labels.collection }} has replication errors for replica {{ $labels.replica }} on {{ $labels.base_url }}. - query: 'increase(solr_metrics_core_errors_total{category="REPLICATION"}[1m]) > 1' - severity: critical - - name: Solr low live node count - description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}. 
- query: "solr_collections_live_nodes < 2" - severity: critical - - - name: Hadoop - exporters: - - name: hadoop/jmx_exporter - slug: jmx_exporter - doc_url: https://github.com/prometheus/jmx_exporter - rules: - # Alert rule for NameNode availability - - name: Hadoop Name Node Down - query: up{job="hadoop-namenode"} == 0 - for: 5m - severity: critical - description: "The Hadoop NameNode service is unavailable." - - # Alert rule for ResourceManager availability - - name: Hadoop Resource Manager Down - query: up{job="hadoop-resourcemanager"} == 0 - for: 5m - severity: critical - description: "The Hadoop ResourceManager service is unavailable." - - # Alert rule for DataNode status - - name: Hadoop Data Node Out Of Service - query: hadoop_datanode_last_heartbeat == 0 - for: 10m - severity: warning - description: "The Hadoop DataNode is not sending heartbeats." - - # Alert rule for low HDFS disk space - - name: Hadoop HDFS Disk Space Low - query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 - for: 15m - severity: warning - description: "Available HDFS disk space is running low." - - # Alert rule for excessive MapReduce task failures - - name: Hadoop Map Reduce Task Failures - query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100 - for: 10m - severity: critical - description: "There is an unusually high number of MapReduce task failures." - - # Alert rule for high ResourceManager memory usage - - name: Hadoop Resource Manager Memory High - query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 - for: 15m - severity: warning - description: "The Hadoop ResourceManager is approaching its memory limit." 
- - # Alert rule for high YARN container allocation failures - - name: Hadoop YARN Container Allocation Failures - query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10 - for: 10m - severity: warning - description: "There is a significant number of YARN container allocation failures." - - # Alert rule for excessive HBase region server region count - - name: Hadoop HBase Region Count High - query: hadoop_hbase_region_count > 5000 - for: 15m - severity: warning - description: "The HBase cluster has an unusually high number of regions." - - # Alert rule for low HBase region server heap space - - name: Hadoop HBase Region Server Heap Low - query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 - for: 10m - severity: warning - description: "HBase Region Servers are running low on heap space." - - # Alert rule for high HBase Write Requests latency - - name: Hadoop HBase Write Requests Latency High - query: hadoop_hbase_write_requests_latency_seconds > 0.5 - for: 10m - severity: warning - description: "HBase Write Requests are experiencing high latency." 
- - - name: Reverse proxies and load balancers + - name: Proxies, load balancers and service meshes services: - name: Nginx exporters: @@ -2645,6 +2571,74 @@ groups: severity: warning for: 5m + - name: Linkerd + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://linkerd.io/2/tasks/exporting-metrics/ + rules: + - name: Linkerd high error rate + description: Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10% + query: "sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10" + severity: warning + for: 1m + + - name: Istio + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ + rules: + - name: Istio Kubernetes gateway availability drop + description: Gateway pods have dropped. Inbound traffic will likely be affected. + query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' + severity: warning + for: 1m + - name: Istio Pilot high total request rate + description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration. + query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5" + severity: warning + for: 1m + - name: Istio Mixer Prometheus dispatches low + description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly. + query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' + severity: warning + for: 1m + - name: Istio high total request rate + description: Global request rate in the service mesh is unusually high. 
+ query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' + severity: warning + for: 2m + - name: Istio low total request rate + description: Global request rate in the service mesh is unusually low. + query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' + severity: warning + for: 2m + - name: Istio high 4xx error rate + description: High percentage of HTTP 4xx responses in Istio (> 5%). + query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' + severity: warning + for: 1m + - name: Istio high 5xx error rate + description: High percentage of HTTP 5xx responses in Istio (> 5%). + query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' + severity: warning + for: 1m + - name: Istio high request latency + description: Istio average requests execution is longer than 100ms. + query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100' + severity: warning + for: 1m + - name: Istio latency 99 percentile + description: Istio 1% slowest requests are longer than 1000ms. + query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000" + severity: warning + for: 1m + - name: Istio Pilot Duplicate Entry + description: Istio pilot duplicate entry error. 
+ query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0" + severity: critical + - name: Runtimes services: - name: PHP-FPM @@ -2892,6 +2886,8 @@ groups: query: "max(sidekiq_queue_latency) > 60" severity: critical + - name: Data engineering + services: - name: Apache Flink exporters: - name: Built-in Prometheus reporter @@ -3029,6 +3025,82 @@ groups: comments: | Disk spilling indicates insufficient memory for the workload. + - name: Hadoop + exporters: + - name: hadoop/jmx_exporter + slug: jmx_exporter + doc_url: https://github.com/prometheus/jmx_exporter + rules: + # Alert rule for NameNode availability + - name: Hadoop Name Node Down + query: up{job="hadoop-namenode"} == 0 + for: 5m + severity: critical + description: "The Hadoop NameNode service is unavailable." + + # Alert rule for ResourceManager availability + - name: Hadoop Resource Manager Down + query: up{job="hadoop-resourcemanager"} == 0 + for: 5m + severity: critical + description: "The Hadoop ResourceManager service is unavailable." + + # Alert rule for DataNode status + - name: Hadoop Data Node Out Of Service + query: hadoop_datanode_last_heartbeat == 0 + for: 10m + severity: warning + description: "The Hadoop DataNode is not sending heartbeats." + + # Alert rule for low HDFS disk space + - name: Hadoop HDFS Disk Space Low + query: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 + for: 15m + severity: warning + description: "Available HDFS disk space is running low." + + # Alert rule for excessive MapReduce task failures + - name: Hadoop Map Reduce Task Failures + query: increase(hadoop_mapreduce_task_failures_total[1h]) > 100 + for: 10m + severity: critical + description: "There is an unusually high number of MapReduce task failures." 
+ + # Alert rule for high ResourceManager memory usage + - name: Hadoop Resource Manager Memory High + query: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 + for: 15m + severity: warning + description: "The Hadoop ResourceManager is approaching its memory limit." + + # Alert rule for high YARN container allocation failures + - name: Hadoop YARN Container Allocation Failures + query: increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10 + for: 10m + severity: warning + description: "There is a significant number of YARN container allocation failures." + + # Alert rule for excessive HBase region server region count + - name: Hadoop HBase Region Count High + query: hadoop_hbase_region_count > 5000 + for: 15m + severity: warning + description: "The HBase cluster has an unusually high number of regions." + + # Alert rule for low HBase region server heap space + - name: Hadoop HBase Region Server Heap Low + query: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 + for: 10m + severity: warning + description: "HBase Region Servers are running low on heap space." + + # Alert rule for high HBase Write Requests latency + - name: Hadoop HBase Write Requests Latency High + query: hadoop_hbase_write_requests_latency_seconds > 0.5 + for: 10m + severity: warning + description: "HBase Write Requests are experiencing high latency." 
+ - name: Orchestrators services: - name: Kubernetes @@ -3350,118 +3422,6 @@ groups: severity: warning for: 2m - - name: Linkerd - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://linkerd.io/2/tasks/exporting-metrics/ - rules: - - name: Linkerd high error rate - description: Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10% - query: "sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10" - severity: warning - for: 1m - - - name: Istio - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ - rules: - - name: Istio Kubernetes gateway availability drop - description: Gateway pods have dropped. Inbound traffic will likely be affected. - query: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' - severity: warning - for: 1m - - name: Istio Pilot high total request rate - description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration. - query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5" - severity: warning - for: 1m - - name: Istio Mixer Prometheus dispatches low - description: Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly. - query: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' - severity: warning - for: 1m - - name: Istio high total request rate - description: Global request rate in the service mesh is unusually high. 
- query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' - severity: warning - for: 2m - - name: Istio low total request rate - description: Global request rate in the service mesh is unusually low. - query: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' - severity: warning - for: 2m - - name: Istio high 4xx error rate - description: High percentage of HTTP 4xx responses in Istio (> 5%). - query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' - severity: warning - for: 1m - - name: Istio high 5xx error rate - description: High percentage of HTTP 5xx responses in Istio (> 5%). - query: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' - severity: warning - for: 1m - - name: Istio high request latency - description: Istio average requests execution is longer than 100ms. - query: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100' - severity: warning - for: 1m - - name: Istio latency 99 percentile - description: Istio 1% slowest requests are longer than 1000ms. - query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000" - severity: warning - for: 1m - - name: Istio Pilot Duplicate Entry - description: Istio pilot duplicate entry error. 
- query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0" - severity: critical - - - name: ArgoCD - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/ - rules: - - name: ArgoCD service not synced - description: Service {{ $labels.name }} run by argo is currently not in sync. - query: 'argocd_app_info{sync_status!="Synced"} != 0' - severity: warning - for: 15m - - name: ArgoCD service unhealthy - description: Service {{ $labels.name }} run by argo is currently not healthy. - query: 'argocd_app_info{health_status!="Healthy"} != 0' - severity: warning - for: 15m - - - name: FluxCD - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://fluxcd.io/flux/monitoring/metrics/ - rules: - - name: Flux Kustomization Failure - description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. - query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0' - severity: warning - for: 15m - - name: Flux HelmRelease Failure - description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. - query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0' - severity: warning - for: 15m - - name: Flux Source Issue - description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s). - query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0' - severity: warning - for: 15m - - name: Flux Image Issue - description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready. 
- query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0' - severity: warning - for: 15m - - name: OpenStack exporters: - name: openstack-exporter/openstack-exporter @@ -3573,6 +3533,318 @@ groups: This alert factors in the allocation ratio to compute effective capacity. The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns. + - name: CI/CD + services: + - name: Jenkins + exporters: + - name: Metric plugin + slug: metric-plugin + doc_url: https://plugins.jenkins.io/prometheus/ + rules: + - name: Jenkins node offline + description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "jenkins_node_offline_value > 0" + severity: critical + for: 5m + - name: Jenkins no node online + description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "jenkins_node_online_value == 0" + severity: critical + - name: Jenkins healthcheck + description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "jenkins_health_check_score < 1" + severity: critical + - name: Jenkins outdated plugins + description: "{{ $value }} plugins need update" + query: "sum(jenkins_plugins_withUpdate) by (instance) > 3" + severity: warning + for: 1d + - name: Jenkins builds health score + description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "default_jenkins_builds_health_score < 1" + severity: critical + - name: Jenkins run failure total + description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. 
Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" + query: "delta(jenkins_runs_failure_total[1h]) > 100" + severity: warning + - name: Jenkins build tests failing + description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" + query: "default_jenkins_builds_last_build_tests_failing > 0" + severity: warning + - name: Jenkins last build failed + description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" + query: "default_jenkins_builds_last_build_result_ordinal == 2" + severity: warning + comments: | + * RUNNING -1 true - The build had no errors. + * SUCCESS 0 true - The build had no errors. + * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. + * FAILURE 2 false - The build had a fatal error. + * NOT_BUILT 3 false - The module was not built. + * ABORTED 4 false - The build was manually aborted. + + - name: ArgoCD + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/ + rules: + - name: ArgoCD service not synced + description: Service {{ $labels.name }} run by argo is currently not in sync. + query: 'argocd_app_info{sync_status!="Synced"} != 0' + severity: warning + for: 15m + - name: ArgoCD service unhealthy + description: Service {{ $labels.name }} run by argo is currently not healthy. 
+ query: 'argocd_app_info{health_status!="Healthy"} != 0' + severity: warning + for: 15m + + - name: FluxCD + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://fluxcd.io/flux/monitoring/metrics/ + rules: + - name: Flux Kustomization Failure + description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. + query: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0' + severity: warning + for: 15m + - name: Flux HelmRelease Failure + description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready. + query: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0' + severity: warning + for: 15m + - name: Flux Source Issue + description: Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s). + query: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0' + severity: warning + for: 15m + - name: Flux Image Issue + description: The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready. + query: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0' + severity: warning + for: 15m + + - name: GitLab CI + exporters: + - name: GitLab built-in exporter + slug: gitlab-built-in-exporter + doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/ + rules: + # Puma web server + - name: GitLab Puma high queued connections + description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread." + query: "avg_over_time(puma_queued_connections[5m]) > 5" + severity: warning + for: 5m + comments: | + Queued connections indicate Puma workers are saturated. 
+ Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb. + - name: GitLab Puma no available pool capacity + description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy." + query: "puma_pool_capacity == 0" + severity: critical + for: 5m + - name: GitLab Puma workers not running + description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total." + query: "puma_running_workers < puma_workers" + severity: warning + for: 5m + # HTTP request handling + - name: GitLab high HTTP error rate + description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}." + query: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5' + severity: critical + for: 5m + comments: | + Threshold is 5% of all requests returning server errors. + Check GitLab logs at /var/log/gitlab/ for root cause. + - name: GitLab high HTTP request latency + description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds." + query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 10" + severity: warning + for: 5m + comments: | + Threshold of 10s may need adjustment based on your instance size and workload. + # Sidekiq background jobs + - name: GitLab Sidekiq jobs failing + description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}." + query: "rate(sidekiq_jobs_failed_total[5m]) > 0" + severity: warning + for: 10m + comments: | + This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. + A sustained failure rate indicates background processing issues. + - name: GitLab Sidekiq queue too large + description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}." 
+ query: "sum(sidekiq_running_jobs) >= sum(sidekiq_concurrency) * 0.9" + severity: warning + for: 10m + comments: | + When running jobs approach the concurrency limit, new jobs will queue up. + Consider scaling Sidekiq workers or increasing concurrency. + - name: GitLab Sidekiq high job completion time + description: "GitLab Sidekiq p95 job completion time for worker {{ $labels.worker }} is above 5 minutes." + query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker)) > 300" + severity: warning + for: 10m + comments: | + This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. + - name: GitLab Sidekiq high queue latency + description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed." + query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le)) > 60" + severity: warning + for: 5m + comments: | + This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. + High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. + # Database connection pool + - name: GitLab database connection pool saturation + description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy." + query: "gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90" + severity: warning + for: 5m + comments: | + When the pool is near saturation, requests may block waiting for a connection. + Increase db_pool_size in gitlab.rb or investigate slow queries. + - name: GitLab database connection pool dead connections + description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections." 
+ query: "gitlab_database_connection_pool_dead > 0" + severity: warning + for: 5m + - name: GitLab database connection pool waiting + description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection." + query: "gitlab_database_connection_pool_waiting > 0" + severity: warning + for: 5m + # CI/CD pipelines + - name: GitLab CI pipeline creation slow + description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds." + query: "histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le)) > 30" + severity: warning + for: 5m + - name: GitLab CI pipeline failures increasing + description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)." + query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0" + severity: warning + for: 10m + - name: GitLab CI runner authentication failures + description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures)." + query: "increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5" + severity: warning + for: 5m + comments: | + Frequent runner auth failures may indicate expired tokens or misconfigured runners. + # Ruby process health + - name: GitLab high memory usage + description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory." + query: "process_resident_memory_bytes{job=~\".*gitlab.*\"} > 2e+9" + severity: warning + for: 10m + comments: | + Threshold of 2GB may need adjustment based on your instance size. + High memory usage can lead to OOM kills and service disruptions. + - name: GitLab Ruby heap fragmentation + description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory." 
+ query: "ruby_gc_stat_ext_heap_fragmentation{job=~\".*gitlab.*\"} > 0.5" + severity: warning + for: 15m + comments: | + Heap fragmentation above 50% means a significant amount of memory is wasted. + A Puma worker restart may help reclaim memory. + # Uncaught errors + - name: GitLab rack uncaught errors + description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)." + query: "rate(rack_uncaught_errors_total[5m]) > 0" + severity: warning + for: 5m + # Application version / deployment + - name: GitLab version mismatch + description: "Multiple GitLab versions are running across the fleet." + query: 'count(count by (version) (deployments{version!=""})) > 1' + severity: warning + comments: | + This may happen during a rolling deployment. If it persists, investigate incomplete upgrades. + # File descriptors + - name: GitLab high file descriptor usage + description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors." + query: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80' + severity: warning + for: 5m + # Ruby threads + - name: GitLab Ruby threads saturated + description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }})." + query: "sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5" + severity: warning + for: 10m + + - name: Workhorse + slug: workhorse + doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/#gitlab-workhorse + rules: + - name: GitLab Workhorse high error rate + description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors." 
+ query: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10' + severity: critical + for: 5m + comments: | + Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. + Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. + - name: GitLab Workhorse high latency + description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds." + query: "histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le)) > 10" + severity: warning + for: 5m + - name: GitLab Workhorse high in-flight requests + description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests." + query: "gitlab_workhorse_http_in_flight_requests > 100" + severity: warning + for: 5m + comments: | + Threshold of 100 may need adjustment based on instance size. + + - name: Gitaly + slug: gitaly + doc_url: https://docs.gitlab.com/administration/gitaly/monitoring/ + rules: + - name: GitLab Gitaly high gRPC error rate + description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors." + query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5' + severity: warning + for: 5m + - name: GitLab Gitaly resource exhausted + description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)." + query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1' + severity: critical + for: 5m + comments: | + ResourceExhausted errors from Gitaly mean Git operations are being rejected due to + concurrency limits. This directly impacts users trying to push, pull, or clone. 
+ This alert is derived from the GitLab Omnibus default rules. + - name: GitLab Gitaly high RPC latency + description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)." + query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1' + severity: warning + for: 5m + - name: GitLab Gitaly CPU throttled + description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups." + query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0" + severity: warning + for: 5m + - name: GitLab Gitaly authentication failures + description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})." + query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0' + severity: warning + - name: GitLab Gitaly circuit breaker tripped + description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing." + query: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0' + severity: critical + comments: | + When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail. + Check Gitaly service health and logs. + - name: Spinnaker exporters: - name: Embedded exporter @@ -3656,74 +3928,8 @@ groups: This metric is specific to AWS cloud providers in Clouddriver. The 1000ms threshold is a rough default. Adjust based on your AWS usage patterns. - - name: Network, security and storage + - name: Network and security services: - - name: Ceph - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/ - rules: - - name: Ceph State - description: Ceph instance unhealthy - query: "ceph_health_status != 0" - severity: critical - - name: Ceph monitor clock skew - description: Ceph monitor clock skew detected. 
Please check ntp and hardware clock settings - query: "abs(ceph_monitor_clock_skew_seconds) > 0.2" - severity: warning - for: 2m - - name: Ceph monitor low space - description: Ceph monitor storage is low. - query: "ceph_monitor_avail_percent < 10" - severity: warning - for: 2m - - name: Ceph OSD Down - description: Ceph Object Storage Daemon Down - query: "ceph_osd_up == 0" - severity: critical - - name: Ceph high OSD latency - description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state." - query: "ceph_osd_perf_apply_latency_seconds > 5" - severity: warning - for: 1m - - name: Ceph OSD low space - description: Ceph Object Storage Daemon is going out of space. Please add more disks. - query: ceph_osd_utilization > 90 - severity: warning - for: 2m - - name: Ceph OSD reweighted - description: Ceph Object Storage Daemon takes too much time to resize. - query: "ceph_osd_weight < 1" - severity: warning - for: 2m - - name: Ceph PG down - description: Some Ceph placement groups are down. Please ensure that all the data are available. - query: "ceph_pg_down > 0" - severity: critical - - name: Ceph PG incomplete - description: Some Ceph placement groups are incomplete. Please ensure that all the data are available. - query: "ceph_pg_incomplete > 0" - severity: critical - - name: Ceph PG inconsistent - description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes. - query: ceph_pg_inconsistent > 0 - severity: warning - - name: Ceph PG activation long - description: Some Ceph placement groups are too long to activate. - query: "ceph_pg_activating > 0" - severity: warning - for: 2m - - name: Ceph PG backfill full - description: Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules. 
- query: "ceph_pg_backfill_toofull > 0" - severity: warning - for: 2m - - name: Ceph PG unavailable - description: Some Ceph placement groups are unavailable. - query: "ceph_pg_total - ceph_pg_active > 0" - severity: critical - - name: SpeedTest exporters: - name: Speedtest exporter @@ -3739,71 +3945,6 @@ groups: query: "avg_over_time(speedtest_upload[10m]) < 20" severity: warning - - name: ZFS - exporters: - - name: node-exporter - slug: node-exporter - doc_url: https://github.com/prometheus/node_exporter - rules: - - name: ZFS offline pool - description: "A ZFS zpool is in a unexpected state: {{ $labels.state }}." - query: 'node_zfs_zpool_state{state!="online"} > 0' - severity: critical - for: 1m - - name: ZFS exporter - slug: zfs_exporter - doc_url: https://github.com/pdf/zfs_exporter - rules: - - name: ZFS pool out of space - description: Disk is almost full (< 10% left) - query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0" - severity: warning - - name: ZFS pool unhealthy - description: ZFS pool state is {{ $value }}. See comments for more information. 
- query: "zfs_pool_health > 0" - severity: critical - comments: | - 0: ONLINE - 1: DEGRADED - 2: FAULTED - 3: OFFLINE - 4: UNAVAIL - 5: REMOVED - 6: SUSPENDED - - name: ZFS collector failed - description: ZFS collector for {{ $labels.instance }} has failed to collect information - query: "zfs_scrape_collector_success != 1" - severity: warning - - - name: OpenEBS - exporters: - - name: Embedded exporter - slug: embedded-exporter - rules: - - name: OpenEBS used pool capacity - description: "OpenEBS Pool use more than 80% of his capacity" - query: "openebs_used_pool_capacity_percent > 80" - severity: warning - for: 2m - - - name: Minio - exporters: - - name: Embedded exporter - slug: embedded-exporter - rules: - - name: Minio cluster disk offline - description: "Minio cluster disk is offline" - query: "minio_cluster_drive_offline_total > 0" - severity: critical - - name: Minio node disk offline - description: "Minio cluster node disk is offline" - query: "minio_cluster_nodes_offline_total > 0" - severity: critical - - name: Minio disk space usage - description: "Minio available free space is low (< 10%)" - query: minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 - severity: warning - - name: SSL/TLS exporters: - name: ssl_exporter @@ -4252,6 +4393,139 @@ groups: May indicate routing issues or a misconfigured allowed-ips. Only useful if you expect continuous traffic on all peers. + - name: Storage + services: + - name: Ceph + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://docs.ceph.com/en/quincy/mgr/prometheus/ + rules: + - name: Ceph State + description: Ceph instance unhealthy + query: "ceph_health_status != 0" + severity: critical + - name: Ceph monitor clock skew + description: Ceph monitor clock skew detected. 
Please check ntp and hardware clock settings + query: "abs(ceph_monitor_clock_skew_seconds) > 0.2" + severity: warning + for: 2m + - name: Ceph monitor low space + description: Ceph monitor storage is low. + query: "ceph_monitor_avail_percent < 10" + severity: warning + for: 2m + - name: Ceph OSD Down + description: Ceph Object Storage Daemon Down + query: "ceph_osd_up == 0" + severity: critical + - name: Ceph high OSD latency + description: "Ceph Object Storage Daemon latency is high. Please check whether it is stuck in a weird state." + query: "ceph_osd_perf_apply_latency_seconds > 5" + severity: warning + for: 1m + - name: Ceph OSD low space + description: Ceph Object Storage Daemon is running out of space. Please add more disks. + query: ceph_osd_utilization > 90 + severity: warning + for: 2m + - name: Ceph OSD reweighted + description: Ceph Object Storage Daemon takes too much time to resize. + query: "ceph_osd_weight < 1" + severity: warning + for: 2m + - name: Ceph PG down + description: Some Ceph placement groups are down. Please ensure that all the data are available. + query: "ceph_pg_down > 0" + severity: critical + - name: Ceph PG incomplete + description: Some Ceph placement groups are incomplete. Please ensure that all the data are available. + query: "ceph_pg_incomplete > 0" + severity: critical + - name: Ceph PG inconsistent + description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes. + query: ceph_pg_inconsistent > 0 + severity: warning + - name: Ceph PG activation long + description: Some Ceph placement groups are taking too long to activate. + query: "ceph_pg_activating > 0" + severity: warning + for: 2m + - name: Ceph PG backfill full + description: Some Ceph placement groups are located on a full Object Storage Daemon in the cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules. 
+ query: "ceph_pg_backfill_toofull > 0" + severity: warning + for: 2m + - name: Ceph PG unavailable + description: Some Ceph placement groups are unavailable. + query: "ceph_pg_total - ceph_pg_active > 0" + severity: critical + + - name: ZFS + exporters: + - name: node-exporter + slug: node-exporter + doc_url: https://github.com/prometheus/node_exporter + rules: + - name: ZFS offline pool + description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}." + query: 'node_zfs_zpool_state{state!="online"} > 0' + severity: critical + for: 1m + - name: ZFS exporter + slug: zfs_exporter + doc_url: https://github.com/pdf/zfs_exporter + rules: + - name: ZFS pool out of space + description: Disk is almost full (< 10% left) + query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0" + severity: warning + - name: ZFS pool unhealthy + description: ZFS pool state is {{ $value }}. See comments for more information. + query: "zfs_pool_health > 0" + severity: critical + comments: | + 0: ONLINE + 1: DEGRADED + 2: FAULTED + 3: OFFLINE + 4: UNAVAIL + 5: REMOVED + 6: SUSPENDED + - name: ZFS collector failed + description: ZFS collector for {{ $labels.instance }} has failed to collect information + query: "zfs_scrape_collector_success != 1" + severity: warning + + - name: OpenEBS + exporters: + - name: Embedded exporter + slug: embedded-exporter + rules: + - name: OpenEBS used pool capacity + description: "OpenEBS pool uses more than 80% of its capacity" + query: "openebs_used_pool_capacity_percent > 80" + severity: warning + for: 2m + + - name: Minio + exporters: + - name: Embedded exporter + slug: embedded-exporter + rules: + - name: Minio cluster disk offline + description: "Minio cluster disk is offline" + query: "minio_cluster_drive_offline_total > 0" + severity: critical + - name: Minio node disk offline + description: "Minio cluster node disk is offline" + query: "minio_cluster_nodes_offline_total > 0" + 
severity: critical + - name: Minio disk space usage + description: "Minio available free space is low (< 10%)" + query: minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 + severity: warning + - name: Cloud providers services: - name: AWS CloudWatch @@ -4469,7 +4743,7 @@ groups: for: 5m - - name: Other + - name: Observability services: - name: Thanos exporters: @@ -5232,54 +5506,55 @@ groups: severity: critical for: 2m - - name: Jenkins + - name: Jaeger exporters: - - name: Metric plugin - slug: metric-plugin - doc_url: https://plugins.jenkins.io/prometheus/ + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://www.jaegertracing.io/docs/latest/monitoring/ rules: - - name: Jenkins node offline - description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "jenkins_node_offline_value > 0" - severity: critical - for: 5m - - name: Jenkins no node online - description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "jenkins_node_online_value == 0" - severity: critical - - name: Jenkins healthcheck - description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "jenkins_health_check_score < 1" - severity: critical - - name: Jenkins outdated plugins - description: "{{ $value }} plugins need update" - query: "sum(jenkins_plugins_withUpdate) by (instance) > 3" + - name: Jaeger agent HTTP server errors + description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors." 
+ query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1' severity: warning - for: 1d - - name: Jenkins builds health score - description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "default_jenkins_builds_health_score < 1" - severity: critical - - name: Jenkins run failure total - description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})" - query: "delta(jenkins_runs_failure_total[1h]) > 100" + for: 15m + - name: Jaeger client RPC request errors + description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors." + query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1' severity: warning - - name: Jenkins build tests failing - description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" - query: "default_jenkins_builds_last_build_tests_failing > 0" + for: 15m + - name: Jaeger client spans dropped + description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." + query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1' severity: warning - - name: Jenkins last build failed - description: "Last build failed: {{$labels.jenkins_job}}. 
Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})" - query: "default_jenkins_builds_last_build_result_ordinal == 2" + for: 15m + - name: Jaeger agent spans dropped + description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches." + query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1' severity: warning - comments: | - * RUNNING -1 true - The build had no errors. - * SUCCESS 0 true - The build had no errors. - * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. - * FAILURE 2 false - The build had a fatal error. - * NOT_BUILT 3 false - The module was not built. - * ABORTED 4 false - The build was manually aborted. + for: 15m + - name: Jaeger collector dropping spans + description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." + query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1' + severity: warning + for: 15m + - name: Jaeger sampling update failing + description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates." + query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1' + severity: warning + for: 15m + - name: Jaeger throttling update failing + description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates." 
+ query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1' + severity: warning + for: 15m + - name: Jaeger query request failures + description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests." + query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1' + severity: warning + for: 15m + - name: Other + services: - name: APC UPS exporters: - name: mdlayher/apcupsd_exporter @@ -5341,268 +5616,3 @@ groups: description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`" query: "store_connection_wait_time_ms > 20" severity: critical - - - name: GitLab - exporters: - - name: GitLab built-in exporter - slug: gitlab-built-in-exporter - doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/ - rules: - # Puma web server - - name: GitLab Puma high queued connections - description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread." - query: "avg_over_time(puma_queued_connections[5m]) > 5" - severity: warning - for: 5m - comments: | - Queued connections indicate Puma workers are saturated. - Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb. - - name: GitLab Puma no available pool capacity - description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy." - query: "puma_pool_capacity == 0" - severity: critical - for: 5m - - name: GitLab Puma workers not running - description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total." 
- query: "puma_running_workers < puma_workers" - severity: warning - for: 5m - # HTTP request handling - - name: GitLab high HTTP error rate - description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}." - query: 'sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5' - severity: critical - for: 5m - comments: | - Threshold is 5% of all requests returning server errors. - Check GitLab logs at /var/log/gitlab/ for root cause. - - name: GitLab high HTTP request latency - description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds." - query: "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 10" - severity: warning - for: 5m - comments: | - Threshold of 10s may need adjustment based on your instance size and workload. - # Sidekiq background jobs - - name: GitLab Sidekiq jobs failing - description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}." - query: "rate(sidekiq_jobs_failed_total[5m]) > 0" - severity: warning - for: 10m - comments: | - This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. - A sustained failure rate indicates background processing issues. - - name: GitLab Sidekiq queue too large - description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}." - query: "sum(sidekiq_running_jobs) >= sum(sidekiq_concurrency) * 0.9" - severity: warning - for: 10m - comments: | - When running jobs approach the concurrency limit, new jobs will queue up. - Consider scaling Sidekiq workers or increasing concurrency. - - name: GitLab Sidekiq high job completion time - description: "GitLab Sidekiq job average completion time on {{ $labels.instance }} is above 5 minutes." 
- query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker)) > 300" - severity: warning - for: 10m - comments: | - This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. - - name: GitLab Sidekiq high queue latency - description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed." - query: "histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le)) > 60" - severity: warning - for: 5m - comments: | - This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled. - High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes. - # Database connection pool - - name: GitLab database connection pool saturation - description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy." - query: "gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90" - severity: warning - for: 5m - comments: | - When the pool is near saturation, requests may block waiting for a connection. - Increase db_pool_size in gitlab.rb or investigate slow queries. - - name: GitLab database connection pool dead connections - description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections." - query: "gitlab_database_connection_pool_dead > 0" - severity: warning - for: 5m - - name: GitLab database connection pool waiting - description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection." - query: "gitlab_database_connection_pool_waiting > 0" - severity: warning - for: 5m - # CI/CD pipelines - - name: GitLab CI pipeline creation slow - description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds." 
- query: "histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le)) > 30" - severity: warning - for: 5m - - name: GitLab CI pipeline failures increasing - description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s)." - query: "rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0" - severity: warning - for: 10m - - name: GitLab CI runner authentication failures - description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures)." - query: "increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5" - severity: warning - for: 5m - comments: | - Frequent runner auth failures may indicate expired tokens or misconfigured runners. - # Ruby process health - - name: GitLab high memory usage - description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory." - query: "process_resident_memory_bytes{job=~\".*gitlab.*\"} > 2e+9" - severity: warning - for: 10m - comments: | - Threshold of 2GB may need adjustment based on your instance size. - High memory usage can lead to OOM kills and service disruptions. - - name: GitLab Ruby heap fragmentation - description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory." - query: "ruby_gc_stat_ext_heap_fragmentation{job=~\".*gitlab.*\"} > 0.5" - severity: warning - for: 15m - comments: | - Heap fragmentation above 50% means a significant amount of memory is wasted. - A Puma worker restart may help reclaim memory. - # Uncaught errors - - name: GitLab rack uncaught errors - description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s)." 
- query: "rate(rack_uncaught_errors_total[5m]) > 0" - severity: warning - for: 5m - # Application version / deployment - - name: GitLab version mismatch - description: "Multiple GitLab versions are running across the fleet." - query: 'count(count by (version) (deployments{version!=""})) > 1' - severity: warning - comments: | - This may happen during a rolling deployment. If it persists, investigate incomplete upgrades. - # File descriptors - - name: GitLab high file descriptor usage - description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors." - query: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80' - severity: warning - for: 5m - # Ruby threads - - name: GitLab Ruby threads saturated - description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }})." - query: "sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5" - severity: warning - for: 10m - - - name: Workhorse - slug: workhorse - doc_url: https://docs.gitlab.com/administration/monitoring/prometheus/gitlab_metrics/#gitlab-workhorse - rules: - - name: GitLab Workhorse high error rate - description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors." - query: 'sum(rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum(rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10' - severity: critical - for: 5m - comments: | - Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying. - Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. - - name: GitLab Workhorse high latency - description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds." 
- query: "histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le)) > 10" - severity: warning - for: 5m - - name: GitLab Workhorse high in-flight requests - description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests." - query: "gitlab_workhorse_http_in_flight_requests > 100" - severity: warning - for: 5m - comments: | - Threshold of 100 may need adjustment based on instance size. - - - name: Gitaly - slug: gitaly - doc_url: https://docs.gitlab.com/administration/gitaly/monitoring/ - rules: - - name: GitLab Gitaly high gRPC error rate - description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors." - query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code!="OK"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5' - severity: warning - for: 5m - - name: GitLab Gitaly resource exhausted - description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%)." - query: 'sum(rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum(rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1' - severity: critical - for: 5m - comments: | - ResourceExhausted errors from Gitaly mean Git operations are being rejected due to - concurrency limits. This directly impacts users trying to push, pull, or clone. - This alert is derived from the GitLab Omnibus default rules. - - name: GitLab Gitaly high RPC latency - description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s)." - query: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le)) > 1' - severity: warning - for: 5m - - name: GitLab Gitaly CPU throttled - description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups." 
- query: "rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0" - severity: warning - for: 5m - - name: GitLab Gitaly authentication failures - description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }})." - query: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 0' - severity: warning - - name: GitLab Gitaly circuit breaker tripped - description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing." - query: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0' - severity: critical - comments: | - When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail. - Check Gitaly service health and logs. - - - name: Jaeger - exporters: - - name: Embedded exporter - slug: embedded-exporter - doc_url: https://www.jaegertracing.io/docs/latest/monitoring/ - rules: - - name: Jaeger agent HTTP server errors - description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors." - query: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger client RPC request errors - description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors." - query: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger client spans dropped - description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." 
- query: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger agent spans dropped - description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches." - query: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger collector dropping spans - description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans." - query: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger sampling update failing - description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates." - query: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger throttling update failing - description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates." - query: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m - - name: Jaeger query request failures - description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests." 
- query: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1' - severity: warning - for: 15m