From 6e2db9859060967f71b2a77eb6acf7dc254788e3 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 5 Nov 2025 17:04:30 +0100 Subject: [PATCH 1/8] feat: add support for exporter-level comments (#481) --- dist/template.yml | 6 +++++- rules.md | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/dist/template.yml b/dist/template.yml index cdde4ea..5c991cb 100644 --- a/dist/template.yml +++ b/dist/template.yml @@ -2,9 +2,13 @@ groups: {% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} - name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }} + {% assign lines = comments | split: " +" %}{% for line in lines %}# {{ line | strip }} + {% endfor %} rules: {% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %} - {% for comment in comments %}# {{ comment | strip }} + {% assign lines = rule.comments | split: " +" %}{% for line in lines %}# {{ line | strip }} {% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }} expr: '{{ rule.query }}' for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %} diff --git a/rules.md b/rules.md index f786f81..02364e5 100644 --- a/rules.md +++ b/rules.md @@ -62,6 +62,7 @@ // @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋 {% endhighlight %} {% else %} +{{ exporter.comments | strip | newline_to_br }} {% highlight bash %} $ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml {% endhighlight %} From 4acbddb21accf6092ed3867cdf6e58cc70ed300d Mon Sep 17 00:00:00 2001 From: samber Date: Wed, 5 Nov 2025 16:04:56 +0000 Subject: [PATCH 2/8] Publish --- dist/rules/apache/lusitaniae-apache-exporter.yml | 1 + dist/rules/apc-ups/apcupsd_exporter.yml | 1 + 
dist/rules/argocd/embedded-exporter.yml | 1 + dist/rules/blackbox/blackbox-exporter.yml | 5 +++++ dist/rules/caddy/embedded-exporter.yml | 1 + dist/rules/cassandra/criteo-cassandra-exporter.yml | 1 + .../cassandra/instaclustr-cassandra-exporter.yml | 1 + dist/rules/ceph/embedded-exporter.yml | 1 + dist/rules/clickhouse/embedded-exporter.yml | 4 ++++ .../rules/cloudflare/lablabs-cloudflare-exporter.yml | 1 + dist/rules/consul/consul-exporter.yml | 1 + dist/rules/coredns/embedded-exporter.yml | 1 + dist/rules/cortex/embedded-exporter.yml | 1 + .../couchdb/gesellix-couchdb-prometheus-exporter.yml | 1 + dist/rules/docker-containers/google-cadvisor.yml | 4 ++++ .../prometheus-community-elasticsearch-exporter.yml | 1 + dist/rules/etcd/embedded-exporter.yml | 1 + dist/rules/fluxcd/embedded-exporter.yml | 1 + dist/rules/freeswitch/znerol-freeswitch-exporter.yml | 1 + dist/rules/grafana-alloy/embedded-exporter.yml | 1 + dist/rules/graph-node/embedded-exporter.yml | 1 + dist/rules/hadoop/jmx_exporter.yml | 1 + dist/rules/haproxy/embedded-exporter-v2.yml | 1 + dist/rules/haproxy/haproxy-exporter-v1.yml | 1 + dist/rules/hashicorp-vault/embedded-exporter.yml | 1 + dist/rules/host-and-hardware/node-exporter.yml | 12 ++++++++++++ dist/rules/istio/embedded-exporter.yml | 1 + dist/rules/jenkins/metric-plugin.yml | 7 +++++++ dist/rules/juniper/czerwonk-junos-exporter.yml | 1 + dist/rules/jvm/jvm-exporter.yml | 1 + dist/rules/kafka/danielqsj-kafka-exporter.yml | 1 + dist/rules/kafka/linkedin-kafka-exporter.yml | 1 + dist/rules/kubernetes/kubestate-exporter.yml | 4 ++++ dist/rules/linkerd/embedded-exporter.yml | 1 + dist/rules/loki/embedded-exporter.yml | 1 + dist/rules/meilisearch/embedded-exporter.yml | 1 + dist/rules/minio/embedded-exporter.yml | 1 + dist/rules/mongodb/dcu-mongodb-exporter.yml | 1 + dist/rules/mongodb/percona-mongodb-exporter.yml | 1 + dist/rules/mongodb/stefanprodan-mgob-exporter.yml | 1 + dist/rules/mysql/mysqld-exporter.yml | 1 + 
dist/rules/nats/nats-exporter.yml | 1 + dist/rules/netdata/embedded-exporter.yml | 1 + dist/rules/nginx/knyar-nginx-exporter.yml | 1 + dist/rules/nomad/embedded-exporter.yml | 1 + dist/rules/openebs/embedded-exporter.yml | 1 + dist/rules/patroni/embedded-exporter-patroni.yml | 1 + dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml | 1 + dist/rules/php-fpm/bakins-fpm-exporter.yml | 1 + dist/rules/postgresql/postgres-exporter.yml | 4 ++++ .../prometheus-self-monitoring/embedded-exporter.yml | 1 + dist/rules/promtail/embedded-exporter.yml | 1 + dist/rules/pulsar/embedded-exporter.yml | 1 + dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml | 6 ++++++ dist/rules/rabbitmq/rabbitmq-exporter.yml | 1 + dist/rules/redis/oliver006-redis-exporter.yml | 2 ++ .../smartctl-exporter.yml | 1 + dist/rules/sidekiq/strech-sidekiq-exporter.yml | 1 + dist/rules/solr/embedded-exporter.yml | 1 + .../speedtest/nlamirault-speedtest-exporter.yml | 1 + dist/rules/sql-server/ozarklake-mssql-exporter.yml | 1 + dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml | 1 + dist/rules/thanos/thanos-bucket-replicate.yml | 1 + dist/rules/thanos/thanos-compactor.yml | 1 + dist/rules/thanos/thanos-component-absent.yml | 1 + dist/rules/thanos/thanos-query.yml | 1 + dist/rules/thanos/thanos-receiver.yml | 1 + dist/rules/thanos/thanos-ruler.yml | 1 + dist/rules/thanos/thanos-sidecar.yml | 1 + dist/rules/thanos/thanos-store.yml | 1 + dist/rules/traefik/embedded-exporter-v1.yml | 1 + dist/rules/traefik/embedded-exporter-v2.yml | 1 + dist/rules/vmware/pryorda-vmware-exporter.yml | 1 + dist/rules/windows-server/windows-exporter.yml | 1 + dist/rules/zfs/node-exporter.yml | 1 + dist/rules/zfs/zfs_exporter.yml | 8 ++++++++ .../cloudflare-kafka-zookeeper-exporter.yml | 1 + dist/rules/zookeeper/dabealu-zookeeper-exporter.yml | 1 + 78 files changed, 124 insertions(+) diff --git a/dist/rules/apache/lusitaniae-apache-exporter.yml b/dist/rules/apache/lusitaniae-apache-exporter.yml index b1f8d02..5876e25 100644 --- 
a/dist/rules/apache/lusitaniae-apache-exporter.yml +++ b/dist/rules/apache/lusitaniae-apache-exporter.yml @@ -2,6 +2,7 @@ groups: - name: LusitaniaeApacheExporter + rules: - alert: ApacheDown diff --git a/dist/rules/apc-ups/apcupsd_exporter.yml b/dist/rules/apc-ups/apcupsd_exporter.yml index 6a071fd..dcc192f 100644 --- a/dist/rules/apc-ups/apcupsd_exporter.yml +++ b/dist/rules/apc-ups/apcupsd_exporter.yml @@ -2,6 +2,7 @@ groups: - name: Apcupsd_exporter + rules: - alert: ApcUpsBatteryNearlyEmpty diff --git a/dist/rules/argocd/embedded-exporter.yml b/dist/rules/argocd/embedded-exporter.yml index 620f140..eba3ce9 100644 --- a/dist/rules/argocd/embedded-exporter.yml +++ b/dist/rules/argocd/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: ArgocdServiceNotSynced diff --git a/dist/rules/blackbox/blackbox-exporter.yml b/dist/rules/blackbox/blackbox-exporter.yml index 3f90436..6d101db 100644 --- a/dist/rules/blackbox/blackbox-exporter.yml +++ b/dist/rules/blackbox/blackbox-exporter.yml @@ -2,6 +2,7 @@ groups: - name: BlackboxExporter + rules: - alert: BlackboxProbeFailed @@ -58,6 +59,10 @@ groups: summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # For probe_ssl_earliest_cert_expiry to be exposed after expiration, you + # need to enable insecure_skip_verify. Note that this will disable + # certificate validation. 
+ # See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config - alert: BlackboxSslCertificateExpired expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' for: 0m diff --git a/dist/rules/caddy/embedded-exporter.yml b/dist/rules/caddy/embedded-exporter.yml index ec8a17e..e8e2635 100644 --- a/dist/rules/caddy/embedded-exporter.yml +++ b/dist/rules/caddy/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: CaddyReverseProxyDown diff --git a/dist/rules/cassandra/criteo-cassandra-exporter.yml b/dist/rules/cassandra/criteo-cassandra-exporter.yml index 7807a69..7ff24bf 100644 --- a/dist/rules/cassandra/criteo-cassandra-exporter.yml +++ b/dist/rules/cassandra/criteo-cassandra-exporter.yml @@ -2,6 +2,7 @@ groups: - name: CriteoCassandraExporter + rules: - alert: CassandraHintsCount diff --git a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml index ca01737..2d397f5 100644 --- a/dist/rules/cassandra/instaclustr-cassandra-exporter.yml +++ b/dist/rules/cassandra/instaclustr-cassandra-exporter.yml @@ -2,6 +2,7 @@ groups: - name: InstaclustrCassandraExporter + rules: - alert: CassandraNodeIsUnavailable diff --git a/dist/rules/ceph/embedded-exporter.yml b/dist/rules/ceph/embedded-exporter.yml index a8221d7..48d433b 100644 --- a/dist/rules/ceph/embedded-exporter.yml +++ b/dist/rules/ceph/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: CephState diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml index a4cd2a7..5553722 100644 --- a/dist/rules/clickhouse/embedded-exporter.yml +++ b/dist/rules/clickhouse/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: ClickhouseNodeDown @@ -85,6 +86,7 @@ groups: summary: ClickHouse No Live Replicas (instance {{ $labels.instance }}) description: 
"There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please replace the threshold with an appropriate value - alert: ClickhouseHighNetworkTraffic expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250' for: 5m @@ -94,6 +96,7 @@ groups: summary: ClickHouse High Network Traffic (instance {{ $labels.instance }}) description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please replace the threshold with an appropriate value - alert: ClickhouseHighTcpConnections expr: 'ClickHouseMetrics_TCPConnection > 400' for: 5m @@ -166,6 +169,7 @@ groups: summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }}) description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please replace the threshold with an appropriate value - alert: ClickhouseHighNetworkUsage expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024' for: 2m diff --git a/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml b/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml index bb4c018..d647ebf 100644 --- a/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml +++ b/dist/rules/cloudflare/lablabs-cloudflare-exporter.yml @@ -2,6 +2,7 @@ groups: - name: LablabsCloudflareExporter + rules: - alert: CloudflareHttp4xxErrorRate diff --git a/dist/rules/consul/consul-exporter.yml b/dist/rules/consul/consul-exporter.yml index 48657d6..0387850 100644 --- a/dist/rules/consul/consul-exporter.yml +++ b/dist/rules/consul/consul-exporter.yml @@ -2,6 +2,7 @@ groups: - name: ConsulExporter + rules: - alert: ConsulServiceHealthcheckFailed diff --git a/dist/rules/coredns/embedded-exporter.yml b/dist/rules/coredns/embedded-exporter.yml index 
eba60e4..ff0d171 100644 --- a/dist/rules/coredns/embedded-exporter.yml +++ b/dist/rules/coredns/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: CorednsPanicCount diff --git a/dist/rules/cortex/embedded-exporter.yml b/dist/rules/cortex/embedded-exporter.yml index c5b6bb7..e711e1e 100644 --- a/dist/rules/cortex/embedded-exporter.yml +++ b/dist/rules/cortex/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: CortexRulerConfigurationReloadFailure diff --git a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml index b689d0c..0280f18 100644 --- a/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml +++ b/dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml @@ -2,6 +2,7 @@ groups: - name: GesellixCouchdbPrometheusExporter + rules: - alert: CouchdbNodeDown diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml index 40eb177..84bf65c 100644 --- a/dist/rules/docker-containers/google-cadvisor.yml +++ b/dist/rules/docker-containers/google-cadvisor.yml @@ -2,8 +2,10 @@ groups: - name: GoogleCadvisor + rules: + # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. - alert: ContainerKilled expr: 'time() - container_last_seen > 60' for: 0m @@ -13,6 +15,7 @@ groups: summary: Container killed (instance {{ $labels.instance }}) description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 
- alert: ContainerAbsent expr: 'absent(container_last_seen)' for: 5m @@ -31,6 +34,7 @@ groups: summary: Container High CPU utilization (instance {{ $labels.instance }}) description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - alert: ContainerHighMemoryUsage expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80' for: 2m diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml index 12705b5..1980c1d 100644 --- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml +++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml @@ -2,6 +2,7 @@ groups: - name: PrometheusCommunityElasticsearchExporter + rules: - alert: ElasticsearchHeapUsageTooHigh diff --git a/dist/rules/etcd/embedded-exporter.yml b/dist/rules/etcd/embedded-exporter.yml index eace7ae..a934eb8 100644 --- a/dist/rules/etcd/embedded-exporter.yml +++ b/dist/rules/etcd/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: EtcdInsufficientMembers diff --git a/dist/rules/fluxcd/embedded-exporter.yml b/dist/rules/fluxcd/embedded-exporter.yml index 5617faa..f0fd9ab 100644 --- a/dist/rules/fluxcd/embedded-exporter.yml +++ b/dist/rules/fluxcd/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: FluxKustomizationFailure diff --git a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml index be9fa1f..154bd63 100644 --- a/dist/rules/freeswitch/znerol-freeswitch-exporter.yml +++ b/dist/rules/freeswitch/znerol-freeswitch-exporter.yml @@ -2,6 +2,7 @@ groups: - name: ZnerolFreeswitchExporter + rules: - alert: 
FreeswitchDown diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml index d86c8a4..99003ec 100644 --- a/dist/rules/grafana-alloy/embedded-exporter.yml +++ b/dist/rules/grafana-alloy/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: GrafanaAlloyServiceDown diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml index a8d0768..b902605 100644 --- a/dist/rules/graph-node/embedded-exporter.yml +++ b/dist/rules/graph-node/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: ProviderFailedBecauseNet_versionFailed diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml index 42d6ee3..5a94f8a 100644 --- a/dist/rules/hadoop/jmx_exporter.yml +++ b/dist/rules/hadoop/jmx_exporter.yml @@ -2,6 +2,7 @@ groups: - name: Jmx_exporter + rules: - alert: HadoopNameNodeDown diff --git a/dist/rules/haproxy/embedded-exporter-v2.yml b/dist/rules/haproxy/embedded-exporter-v2.yml index 820159e..a296434 100644 --- a/dist/rules/haproxy/embedded-exporter-v2.yml +++ b/dist/rules/haproxy/embedded-exporter-v2.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporterV2 + rules: - alert: HaproxyHighHttp4xxErrorRateBackend diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml index 7be81a0..9af8084 100644 --- a/dist/rules/haproxy/haproxy-exporter-v1.yml +++ b/dist/rules/haproxy/haproxy-exporter-v1.yml @@ -2,6 +2,7 @@ groups: - name: HaproxyExporterV1 + rules: - alert: HaproxyDown diff --git a/dist/rules/hashicorp-vault/embedded-exporter.yml b/dist/rules/hashicorp-vault/embedded-exporter.yml index cd36928..51fcb22 100644 --- a/dist/rules/hashicorp-vault/embedded-exporter.yml +++ b/dist/rules/hashicorp-vault/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: VaultSealed diff --git 
a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 4faab18..5eef86e 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -2,6 +2,7 @@ groups: - name: NodeExporter + rules: - alert: HostOutOfMemory @@ -22,6 +23,7 @@ groups: summary: Host memory under memory pressure (instance {{ $labels.instance }}) description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostMemoryIsUnderutilized expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' for: 0m @@ -58,6 +60,9 @@ groups: summary: Host unusual disk read rate (instance {{ $labels.instance }}) description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - alert: HostOutOfDiskSpace expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' for: 2m @@ -67,6 +72,9 @@ groups: summary: Host out of disk space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Please add ignored mountpoints in node_exporter parameters like + # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". + # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
- alert: HostDiskMayFillIn24Hours expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' for: 2m @@ -130,6 +138,7 @@ groups: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly - alert: HostCpuIsUnderutilized expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' for: 1w @@ -166,6 +175,9 @@ groups: summary: Host unusual disk IO (instance {{ $labels.instance }}) description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # x2 context switches is an arbitrary number. + # The alert threshold depends on the nature of the application. + # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - alert: HostContextSwitchingHigh expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' for: 0m diff --git a/dist/rules/istio/embedded-exporter.yml b/dist/rules/istio/embedded-exporter.yml index ce1c837..aef8632 100644 --- a/dist/rules/istio/embedded-exporter.yml +++ b/dist/rules/istio/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: IstioKubernetesGatewayAvailabilityDrop diff --git a/dist/rules/jenkins/metric-plugin.yml b/dist/rules/jenkins/metric-plugin.yml index 38fb8c8..57c9cf6 100644 --- a/dist/rules/jenkins/metric-plugin.yml +++ b/dist/rules/jenkins/metric-plugin.yml @@ -2,6 +2,7 @@ groups: - name: MetricPlugin + rules: - alert: JenkinsOffline @@ -58,6 +59,12 @@ groups: summary: Jenkins build tests failing (instance {{ $labels.instance }}) 
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # * RUNNING -1 true - The build had no errors. + # * SUCCESS 0 true - The build had no errors. + # * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed. + # * FAILURE 2 false - The build had a fatal error. + # * NOT_BUILT 3 false - The module was not built. + # * ABORTED 4 false - The build was manually aborted. - alert: JenkinsLastBuildFailed expr: 'default_jenkins_builds_last_build_result_ordinal == 2' for: 0m diff --git a/dist/rules/juniper/czerwonk-junos-exporter.yml b/dist/rules/juniper/czerwonk-junos-exporter.yml index 8543fa8..e224732 100644 --- a/dist/rules/juniper/czerwonk-junos-exporter.yml +++ b/dist/rules/juniper/czerwonk-junos-exporter.yml @@ -2,6 +2,7 @@ groups: - name: CzerwonkJunosExporter + rules: - alert: JuniperSwitchDown diff --git a/dist/rules/jvm/jvm-exporter.yml b/dist/rules/jvm/jvm-exporter.yml index db8dd79..8828f52 100644 --- a/dist/rules/jvm/jvm-exporter.yml +++ b/dist/rules/jvm/jvm-exporter.yml @@ -2,6 +2,7 @@ groups: - name: JvmExporter + rules: - alert: JvmMemoryFillingUp diff --git a/dist/rules/kafka/danielqsj-kafka-exporter.yml b/dist/rules/kafka/danielqsj-kafka-exporter.yml index 31bdd1b..5348361 100644 --- a/dist/rules/kafka/danielqsj-kafka-exporter.yml +++ b/dist/rules/kafka/danielqsj-kafka-exporter.yml @@ -2,6 +2,7 @@ groups: - name: DanielqsjKafkaExporter + rules: - alert: KafkaTopicsReplicas diff --git a/dist/rules/kafka/linkedin-kafka-exporter.yml b/dist/rules/kafka/linkedin-kafka-exporter.yml index 6f572e5..01f6a83 100644 --- a/dist/rules/kafka/linkedin-kafka-exporter.yml +++ b/dist/rules/kafka/linkedin-kafka-exporter.yml @@ -2,6 +2,7 @@ groups: - name: LinkedinKafkaExporter + rules: - alert: KafkaTopicOffsetDecreased diff --git 
a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 1125181..a49cf88 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -2,6 +2,7 @@ groups: - name: KubestateExporter + rules: - alert: KubernetesNodeNotReady @@ -13,6 +14,8 @@ groups: summary: Kubernetes Node ready (node {{ $labels.node }}) description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Kubernetes Node with disabled schedules are fine. + # This alarm can be useful to get warned if there are nodes which are longer unscheduled. - alert: KubernetesNodeSchedulingDisabled expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1' for: 30m @@ -265,6 +268,7 @@ groups: summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }}) description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Threshold should be customized for each cronjob name. 
- alert: KubernetesCronjobTooLong expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600' for: 0m diff --git a/dist/rules/linkerd/embedded-exporter.yml b/dist/rules/linkerd/embedded-exporter.yml index 269aacd..6afaaf4 100644 --- a/dist/rules/linkerd/embedded-exporter.yml +++ b/dist/rules/linkerd/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: LinkerdHighErrorRate diff --git a/dist/rules/loki/embedded-exporter.yml b/dist/rules/loki/embedded-exporter.yml index 077036a..0283bc5 100644 --- a/dist/rules/loki/embedded-exporter.yml +++ b/dist/rules/loki/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: LokiProcessTooManyRestarts diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml index 8da2803..9e31806 100644 --- a/dist/rules/meilisearch/embedded-exporter.yml +++ b/dist/rules/meilisearch/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: MeilisearchIndexIsEmpty diff --git a/dist/rules/minio/embedded-exporter.yml b/dist/rules/minio/embedded-exporter.yml index 1ac2de5..ea55791 100644 --- a/dist/rules/minio/embedded-exporter.yml +++ b/dist/rules/minio/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: MinioClusterDiskOffline diff --git a/dist/rules/mongodb/dcu-mongodb-exporter.yml b/dist/rules/mongodb/dcu-mongodb-exporter.yml index 5df73f7..9ef62a0 100644 --- a/dist/rules/mongodb/dcu-mongodb-exporter.yml +++ b/dist/rules/mongodb/dcu-mongodb-exporter.yml @@ -2,6 +2,7 @@ groups: - name: DcuMongodbExporter + rules: - alert: MongodbReplicationLag diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml index 1bd446f..5dc503a 100644 --- a/dist/rules/mongodb/percona-mongodb-exporter.yml +++ b/dist/rules/mongodb/percona-mongodb-exporter.yml @@ -2,6 
+2,7 @@ groups: - name: PerconaMongodbExporter + rules: - alert: MongodbDown diff --git a/dist/rules/mongodb/stefanprodan-mgob-exporter.yml b/dist/rules/mongodb/stefanprodan-mgob-exporter.yml index 3fc7afb..885a5bb 100644 --- a/dist/rules/mongodb/stefanprodan-mgob-exporter.yml +++ b/dist/rules/mongodb/stefanprodan-mgob-exporter.yml @@ -2,6 +2,7 @@ groups: - name: StefanprodanMgobExporter + rules: - alert: MgobBackupFailed diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml index 3ef716f..a19fc17 100644 --- a/dist/rules/mysql/mysqld-exporter.yml +++ b/dist/rules/mysql/mysqld-exporter.yml @@ -2,6 +2,7 @@ groups: - name: MysqldExporter + rules: - alert: MysqlDown diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index a0e26c8..e9ed81e 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -2,6 +2,7 @@ groups: - name: NatsExporter + rules: - alert: NatsHighConnectionCount diff --git a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml index 8c57745..4540a17 100644 --- a/dist/rules/netdata/embedded-exporter.yml +++ b/dist/rules/netdata/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: NetdataHighCpuUsage diff --git a/dist/rules/nginx/knyar-nginx-exporter.yml b/dist/rules/nginx/knyar-nginx-exporter.yml index 4c971a4..a7ab176 100644 --- a/dist/rules/nginx/knyar-nginx-exporter.yml +++ b/dist/rules/nginx/knyar-nginx-exporter.yml @@ -2,6 +2,7 @@ groups: - name: KnyarNginxExporter + rules: - alert: NginxHighHttp4xxErrorRate diff --git a/dist/rules/nomad/embedded-exporter.yml b/dist/rules/nomad/embedded-exporter.yml index 446b52f..b8b4059 100644 --- a/dist/rules/nomad/embedded-exporter.yml +++ b/dist/rules/nomad/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: NomadJobFailed diff --git a/dist/rules/openebs/embedded-exporter.yml 
b/dist/rules/openebs/embedded-exporter.yml index 5f97a82..2070346 100644 --- a/dist/rules/openebs/embedded-exporter.yml +++ b/dist/rules/openebs/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: OpenebsUsedPoolCapacity diff --git a/dist/rules/patroni/embedded-exporter-patroni.yml b/dist/rules/patroni/embedded-exporter-patroni.yml index 561f12f..87528f3 100644 --- a/dist/rules/patroni/embedded-exporter-patroni.yml +++ b/dist/rules/patroni/embedded-exporter-patroni.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporterPatroni + rules: - alert: PatroniHasNoLeader diff --git a/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml b/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml index e3dc0cc..ec83f51 100644 --- a/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml +++ b/dist/rules/pgbouncer/spreaker-pgbouncer-exporter.yml @@ -2,6 +2,7 @@ groups: - name: SpreakerPgbouncerExporter + rules: - alert: PgbouncerActiveConnections diff --git a/dist/rules/php-fpm/bakins-fpm-exporter.yml b/dist/rules/php-fpm/bakins-fpm-exporter.yml index 1dfbd52..f5cce13 100644 --- a/dist/rules/php-fpm/bakins-fpm-exporter.yml +++ b/dist/rules/php-fpm/bakins-fpm-exporter.yml @@ -2,6 +2,7 @@ groups: - name: BakinsFpmExporter + rules: - alert: Php-fpmMax-childrenReached diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 4514e9a..75e71b4 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -2,6 +2,7 @@ groups: - name: PostgresExporter + rules: - alert: PostgresqlDown @@ -166,6 +167,7 @@ groups: summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) description: "Too many locks acquired on the database. 
If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlBloatIndexHigh(>80%) expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)' for: 1h @@ -175,6 +177,7 @@ groups: summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlBloatTableHigh(>80%) expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)' for: 1h @@ -184,6 +187,7 @@ groups: summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) description: "The table {{ $labels.relname }} is bloated. 
You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - alert: PostgresqlInvalidIndex expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' for: 6h diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml index 908f001..5c623d5 100644 --- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml +++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: PrometheusJobMissing diff --git a/dist/rules/promtail/embedded-exporter.yml b/dist/rules/promtail/embedded-exporter.yml index c2dc0e0..bd32fa5 100644 --- a/dist/rules/promtail/embedded-exporter.yml +++ b/dist/rules/promtail/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: PromtailRequestErrors diff --git a/dist/rules/pulsar/embedded-exporter.yml b/dist/rules/pulsar/embedded-exporter.yml index bf03a87..c6ba4ae 100644 --- a/dist/rules/pulsar/embedded-exporter.yml +++ b/dist/rules/pulsar/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: PulsarSubscriptionHighNumberOfBacklogEntries diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml index 40b6d95..85a19e4 100644 --- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml @@ -2,6 +2,7 @@ groups: - name: KbuddeRabbitmqExporter + rules: - alert: RabbitmqDown @@ -49,6 +50,7 @@ groups: summary: RabbitMQ too many connections (instance {{ $labels.instance }}) description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Indicate the queue name in dedicated label. 
- alert: RabbitmqDeadLetterQueueFillingUp expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' for: 1m @@ -58,6 +60,7 @@ groups: summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }}) description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Indicate the queue name in dedicated label. - alert: RabbitmqTooManyMessagesInQueue expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' for: 2m @@ -67,6 +70,7 @@ groups: summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }}) description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Indicate the queue name in dedicated label. - alert: RabbitmqSlowQueueConsuming expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' for: 2m @@ -85,6 +89,7 @@ groups: summary: RabbitMQ no consumer (instance {{ $labels.instance }}) description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Indicate the queue name in dedicated label. - alert: RabbitmqTooManyConsumers expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' for: 0m @@ -94,6 +99,7 @@ groups: summary: RabbitMQ too many consumers (instance {{ $labels.instance }}) description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # Indicate the exchange name in dedicated label. 
- alert: RabbitmqUnactiveExchange expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' for: 2m diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index 10823d2..46a23ab 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -2,6 +2,7 @@ groups: - name: RabbitmqExporter + rules: - alert: RabbitmqNodeDown diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml index 6b4dd8d..7ca53b6 100644 --- a/dist/rules/redis/oliver006-redis-exporter.yml +++ b/dist/rules/redis/oliver006-redis-exporter.yml @@ -2,6 +2,7 @@ groups: - name: Oliver006RedisExporter + rules: - alert: RedisDown @@ -67,6 +68,7 @@ groups: summary: Redis missing backup (instance {{ $labels.instance }}) description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable. 
- alert: RedisOutOfSystemMemory expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90' for: 2m diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml index 866d715..6b0c48e 100644 --- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml +++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml @@ -2,6 +2,7 @@ groups: - name: SmartctlExporter + rules: - alert: SmartDeviceTemperatureWarning diff --git a/dist/rules/sidekiq/strech-sidekiq-exporter.yml b/dist/rules/sidekiq/strech-sidekiq-exporter.yml index b704381..7da4969 100644 --- a/dist/rules/sidekiq/strech-sidekiq-exporter.yml +++ b/dist/rules/sidekiq/strech-sidekiq-exporter.yml @@ -2,6 +2,7 @@ groups: - name: StrechSidekiqExporter + rules: - alert: SidekiqQueueSize diff --git a/dist/rules/solr/embedded-exporter.yml b/dist/rules/solr/embedded-exporter.yml index 70e567f..5b653fb 100644 --- a/dist/rules/solr/embedded-exporter.yml +++ b/dist/rules/solr/embedded-exporter.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporter + rules: - alert: SolrUpdateErrors diff --git a/dist/rules/speedtest/nlamirault-speedtest-exporter.yml b/dist/rules/speedtest/nlamirault-speedtest-exporter.yml index a296d6b..6933caa 100644 --- a/dist/rules/speedtest/nlamirault-speedtest-exporter.yml +++ b/dist/rules/speedtest/nlamirault-speedtest-exporter.yml @@ -2,6 +2,7 @@ groups: - name: NlamiraultSpeedtestExporter + rules: - alert: SpeedtestSlowInternetDownload diff --git a/dist/rules/sql-server/ozarklake-mssql-exporter.yml b/dist/rules/sql-server/ozarklake-mssql-exporter.yml index 7bf3422..a699402 100644 --- a/dist/rules/sql-server/ozarklake-mssql-exporter.yml +++ b/dist/rules/sql-server/ozarklake-mssql-exporter.yml @@ -2,6 +2,7 @@ groups: - name: OzarklakeMssqlExporter + rules: - alert: SqlServerDown diff --git a/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml b/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml index 
f068e46..dcc1ce9 100644 --- a/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml +++ b/dist/rules/ssl/tls/ribbybibby-ssl-exporter.yml @@ -2,6 +2,7 @@ groups: - name: RibbybibbySslExporter + rules: - alert: SslCertificateProbeFailed diff --git a/dist/rules/thanos/thanos-bucket-replicate.yml b/dist/rules/thanos/thanos-bucket-replicate.yml index 694ef0e..972ed1c 100644 --- a/dist/rules/thanos/thanos-bucket-replicate.yml +++ b/dist/rules/thanos/thanos-bucket-replicate.yml @@ -2,6 +2,7 @@ groups: - name: ThanosBucketReplicate + rules: - alert: ThanosBucketReplicateErrorRate diff --git a/dist/rules/thanos/thanos-compactor.yml b/dist/rules/thanos/thanos-compactor.yml index 8276574..67032a9 100644 --- a/dist/rules/thanos/thanos-compactor.yml +++ b/dist/rules/thanos/thanos-compactor.yml @@ -2,6 +2,7 @@ groups: - name: ThanosCompactor + rules: - alert: ThanosCompactorMultipleRunning diff --git a/dist/rules/thanos/thanos-component-absent.yml b/dist/rules/thanos/thanos-component-absent.yml index 7454e62..5fc41df 100644 --- a/dist/rules/thanos/thanos-component-absent.yml +++ b/dist/rules/thanos/thanos-component-absent.yml @@ -2,6 +2,7 @@ groups: - name: ThanosComponentAbsent + rules: - alert: ThanosCompactIsDown diff --git a/dist/rules/thanos/thanos-query.yml b/dist/rules/thanos/thanos-query.yml index 18edd83..7813b8a 100644 --- a/dist/rules/thanos/thanos-query.yml +++ b/dist/rules/thanos/thanos-query.yml @@ -2,6 +2,7 @@ groups: - name: ThanosQuery + rules: - alert: ThanosQueryHttpRequestQueryErrorRateHigh diff --git a/dist/rules/thanos/thanos-receiver.yml b/dist/rules/thanos/thanos-receiver.yml index ef4270c..9ff00c3 100644 --- a/dist/rules/thanos/thanos-receiver.yml +++ b/dist/rules/thanos/thanos-receiver.yml @@ -2,6 +2,7 @@ groups: - name: ThanosReceiver + rules: - alert: ThanosReceiveHttpRequestErrorRateHigh diff --git a/dist/rules/thanos/thanos-ruler.yml b/dist/rules/thanos/thanos-ruler.yml index 3aa28b0..e69e6fa 100644 --- a/dist/rules/thanos/thanos-ruler.yml +++ 
b/dist/rules/thanos/thanos-ruler.yml @@ -2,6 +2,7 @@ groups: - name: ThanosRuler + rules: - alert: ThanosRuleQueueIsDroppingAlerts diff --git a/dist/rules/thanos/thanos-sidecar.yml b/dist/rules/thanos/thanos-sidecar.yml index 01505fd..82bab36 100644 --- a/dist/rules/thanos/thanos-sidecar.yml +++ b/dist/rules/thanos/thanos-sidecar.yml @@ -2,6 +2,7 @@ groups: - name: ThanosSidecar + rules: - alert: ThanosSidecarBucketOperationsFailed diff --git a/dist/rules/thanos/thanos-store.yml b/dist/rules/thanos/thanos-store.yml index 0651a6f..633ba97 100644 --- a/dist/rules/thanos/thanos-store.yml +++ b/dist/rules/thanos/thanos-store.yml @@ -2,6 +2,7 @@ groups: - name: ThanosStore + rules: - alert: ThanosStoreGrpcErrorRate diff --git a/dist/rules/traefik/embedded-exporter-v1.yml b/dist/rules/traefik/embedded-exporter-v1.yml index e25a8e7..0d95ac6 100644 --- a/dist/rules/traefik/embedded-exporter-v1.yml +++ b/dist/rules/traefik/embedded-exporter-v1.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporterV1 + rules: - alert: TraefikBackendDown diff --git a/dist/rules/traefik/embedded-exporter-v2.yml b/dist/rules/traefik/embedded-exporter-v2.yml index afc9040..d04519a 100644 --- a/dist/rules/traefik/embedded-exporter-v2.yml +++ b/dist/rules/traefik/embedded-exporter-v2.yml @@ -2,6 +2,7 @@ groups: - name: EmbeddedExporterV2 + rules: - alert: TraefikServiceDown diff --git a/dist/rules/vmware/pryorda-vmware-exporter.yml b/dist/rules/vmware/pryorda-vmware-exporter.yml index e3b2185..e9dbfe2 100644 --- a/dist/rules/vmware/pryorda-vmware-exporter.yml +++ b/dist/rules/vmware/pryorda-vmware-exporter.yml @@ -2,6 +2,7 @@ groups: - name: PryordaVmwareExporter + rules: - alert: VirtualMachineMemoryWarning diff --git a/dist/rules/windows-server/windows-exporter.yml b/dist/rules/windows-server/windows-exporter.yml index 6b4e806..08ab937 100644 --- a/dist/rules/windows-server/windows-exporter.yml +++ b/dist/rules/windows-server/windows-exporter.yml @@ -2,6 +2,7 @@ groups: - name: WindowsExporter + 
rules: - alert: WindowsServerCollectorError diff --git a/dist/rules/zfs/node-exporter.yml b/dist/rules/zfs/node-exporter.yml index e4c9cd6..546abf1 100644 --- a/dist/rules/zfs/node-exporter.yml +++ b/dist/rules/zfs/node-exporter.yml @@ -2,6 +2,7 @@ groups: - name: NodeExporter + rules: - alert: ZfsOfflinePool diff --git a/dist/rules/zfs/zfs_exporter.yml b/dist/rules/zfs/zfs_exporter.yml index acb0337..52abb00 100644 --- a/dist/rules/zfs/zfs_exporter.yml +++ b/dist/rules/zfs/zfs_exporter.yml @@ -2,6 +2,7 @@ groups: - name: Zfs_exporter + rules: - alert: ZfsPoolOutOfSpace @@ -13,6 +14,13 @@ groups: summary: ZFS pool out of space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + # 0: ONLINE + # 1: DEGRADED + # 2: FAULTED + # 3: OFFLINE + # 4: UNAVAIL + # 5: REMOVED + # 6: SUSPENDED - alert: ZfsPoolUnhealthy expr: 'zfs_pool_health > 0' for: 0m diff --git a/dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml b/dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml index 0219f03..e088e69 100644 --- a/dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml +++ b/dist/rules/zookeeper/cloudflare-kafka-zookeeper-exporter.yml @@ -2,4 +2,5 @@ groups: - name: CloudflareKafkaZookeeperExporter + rules: diff --git a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml index 64c8e7b..0e3747e 100644 --- a/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml +++ b/dist/rules/zookeeper/dabealu-zookeeper-exporter.yml @@ -2,6 +2,7 @@ groups: - name: DabealuZookeeperExporter + rules: - alert: ZookeeperDown From d58bc324ad88a11f8a80912dfc90e23996dfa15a Mon Sep 17 00:00:00 2001 From: Arve Knudsen Date: Wed, 5 Nov 2025 17:08:26 +0100 Subject: [PATCH 3/8] Add OpenTelemetry Collector monitoring alerts (#480) Signed-off-by: Arve Knudsen --- _data/rules.yml | 70 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) 
diff --git a/_data/rules.yml b/_data/rules.yml index 985a546..bf2ee22 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -3183,6 +3183,76 @@ groups: query: "count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) " severity: critical + - name: OpenTelemetry Collector + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/ + comments: | + OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint. + These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly. + All collector internal metrics are prefixed with 'otelcol_'. + rules: + - name: OpenTelemetry Collector down + description: OpenTelemetry Collector instance has disappeared or is not being scraped + query: 'up{job=~".*otel.*collector.*"} == 0' + severity: critical + for: 1m + - name: OpenTelemetry Collector receiver refused spans + description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}" + query: 'rate(otelcol_receiver_refused_spans[5m]) > 0' + severity: critical + for: 5m + - name: OpenTelemetry Collector receiver refused metric points + description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}" + query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' + severity: critical + for: 5m + - name: OpenTelemetry Collector receiver refused log records + description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}" + query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' + severity: critical + for: 5m + - name: OpenTelemetry Collector exporter failed spans + description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}" + query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0' + severity: warning + for: 5m + - name: OpenTelemetry Collector exporter failed metric 
points + description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}" + query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0' + severity: warning + for: 5m + - name: OpenTelemetry Collector exporter failed log records + description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}" + query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0' + severity: warning + for: 5m + - name: OpenTelemetry Collector exporter queue nearly full + description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full" + query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0' + severity: warning + - name: OpenTelemetry Collector processor refused spans + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure" + query: 'rate(otelcol_processor_refused_spans[5m]) > 0' + severity: warning + for: 5m + - name: OpenTelemetry Collector processor refused metric points + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure" + query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0' + severity: warning + for: 5m + - name: OpenTelemetry Collector high memory usage + description: "OpenTelemetry Collector memory usage is above 90%" + query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' + severity: warning + for: 5m + - name: OpenTelemetry Collector OTLP receiver errors + description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused" + query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0' + severity: critical + 
for: 2m + - name: Jenkins exporters: - name: Metric plugin From cea78d7fd637da484618403812c2caa5aeb8bc57 Mon Sep 17 00:00:00 2001 From: samber Date: Wed, 5 Nov 2025 16:08:52 +0000 Subject: [PATCH 4/8] Publish --- .../embedded-exporter.yml | 117 ++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 dist/rules/opentelemetry-collector/embedded-exporter.yml diff --git a/dist/rules/opentelemetry-collector/embedded-exporter.yml b/dist/rules/opentelemetry-collector/embedded-exporter.yml new file mode 100644 index 0000000..2ab4217 --- /dev/null +++ b/dist/rules/opentelemetry-collector/embedded-exporter.yml @@ -0,0 +1,117 @@ +groups: + +- name: EmbeddedExporter + + # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint. + # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly. + # All collector internal metrics are prefixed with 'otelcol_'. + + rules: + + - alert: OpentelemetryCollectorDown + expr: 'up{job=~".*otel.*collector.*"} == 0' + for: 1m + labels: + severity: critical + annotations: + summary: OpenTelemetry Collector down (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorReceiverRefusedSpans + expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorReceiverRefusedMetricPoints + expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: OpenTelemetry Collector receiver refused metric points 
(instance {{ $labels.instance }}) + description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorReceiverRefusedLogRecords + expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0' + for: 5m + labels: + severity: critical + annotations: + summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorExporterFailedSpans + expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorExporterFailedMetricPoints + expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorExporterFailedLogRecords + expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorExporterQueueNearlyFull + expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 
0.8 and otelcol_exporter_queue_capacity > 0' + for: 0m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorProcessorRefusedSpans + expr: 'rate(otelcol_processor_refused_spans[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorProcessorRefusedMetricPoints + expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorHighMemoryUsage + expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9' + for: 5m + labels: + severity: warning + annotations: + summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: OpentelemetryCollectorOtlpReceiverErrors + expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0' + for: 2m + labels: + severity: critical + annotations: + summary: OpenTelemetry 
Collector OTLP receiver errors (instance {{ $labels.instance }}) + description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 48f2dde80c8d810c3a29064148ea021d27171227 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 5 Nov 2025 17:12:50 +0100 Subject: [PATCH 5/8] feat: use /refs/heads/master/ instead of /master/ for yaml url (#482) --- rules.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules.md b/rules.md index 02364e5..1ce19c7 100644 --- a/rules.md +++ b/rules.md @@ -64,7 +64,7 @@ {% else %} {{ exporter.comments | strip | newline_to_br }} {% highlight bash %} -$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml +$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/refs/heads/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml {% endhighlight %} {% endif %} From e617c0717928b03b52ca5b88c4afd77e79b57b37 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 5 Nov 2025 17:14:47 +0100 Subject: [PATCH 6/8] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 2e4e70e..be1fdf8 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) +- [Opentelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector) - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) From
d0d1b00a7be5b82d97e6966ed260f06da61cda4c Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 5 Nov 2025 17:15:10 +0100 Subject: [PATCH 7/8] Fix typo in OpenTelemetry Collector link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index be1fdf8..db16f95 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) -- [Opentelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector) +- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector) - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) From d6589237e11d5b4590f977fd95e16460fc82837d Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 13 Nov 2025 16:24:49 +0100 Subject: [PATCH 8/8] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 02b8c38..816a59f 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -18,7 +18,7 @@ Please ensure your pull request adheres to the following guidelines: ## Improving Github page -### Run localy +### Run locally ``` gem install bundler