Compare commits

..

No commits in common. "master" and "2024-05-02.1" have entirely different histories.

191 changed files with 2839 additions and 20712 deletions

1
.github/FUNDING.yml vendored
View file

@ -1,2 +1 @@
github: [samber]
ko_fi: samuelberthe

View file

@ -5,8 +5,3 @@ updates:
directory: "/"
schedule:
interval: "monthly"
- package-ecosystem: "npm"
directory: "/site"
schedule:
interval: "monthly"

View file

@ -1,25 +0,0 @@
name: Dependabot automerge
on:
pull_request:
types: [opened, synchronize]
jobs:
automerge:
runs-on: ubuntu-latest
if: github.actor == 'dependabot[bot]'
permissions:
contents: write
pull-requests: write
steps:
- name: Fetch Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v3
- name: Enable auto-merge for github-actions updates
if: steps.metadata.outputs.package-ecosystem == 'github_actions'
run: gh pr merge --auto --squash "$PR_URL"
env:
PR_URL: ${{ github.event.pull_request.html_url }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View file

@ -1,62 +0,0 @@
name: Deploy Astro site to GitHub Pages
on:
push:
branches: [master]
workflow_dispatch:
# Only allow one concurrent deployment
concurrency:
group: pages
cancel-in-progress: false
permissions:
contents: read
pages: write
id-token: write
jobs:
build:
name: Build
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 'latest'
cache: npm
cache-dependency-path: site/package-lock.json
- name: Install dependencies
working-directory: site
run: npm ci
- name: Build Astro site
working-directory: site
env:
ASTRO_TELEMETRY_DISABLED: "1"
run: npm run build
- name: Build Pagefind search index
working-directory: site
run: npx pagefind --site dist
- name: Upload Pages artifact
uses: actions/upload-pages-artifact@v5
with:
path: site/dist
deploy:
name: Deploy
needs: build
runs-on: ubuntu-latest
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
steps:
- name: Deploy to GitHub Pages
id: deployment
uses: actions/deploy-pages@v5

View file

@ -1,38 +1,34 @@
name: Publish
on:
workflow_dispatch:
push:
branches:
- master
permissions:
contents: write
jobs:
publish:
name: Publish
# Check if the PR is not from a fork
if: github.repository_owner == 'samber'
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v6
uses: actions/checkout@v4
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
ruby-version: '3.4'
ruby-version: 2.7
- name: Set up yq
uses: mikefarah/yq@v4
uses: mikefarah/yq@master
- name: Install liquid
run: |
gem install liquid -v 5.5.1
gem install liquid-cli
run: gem install liquid-cli
- name: Build rule configuration
run: |
gem install liquid-cli
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
rm -rf dist/rules
@ -42,7 +38,7 @@ jobs:
mkdir -p "${subdir}"
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
@ -55,7 +51,7 @@ jobs:
# https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/
- name: Check for modified files
id: git-check
run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT
run: echo ::set-output name=modified::$(git status -s --porcelain | wc -l | awk '{$1=$1};1')
- name: Push changes
if: steps.git-check.outputs.modified != '0'
run: |

View file

@ -1,38 +0,0 @@
name: Site build
on:
pull_request:
paths:
- site/**
- _data/**
push:
branches:
- master
paths:
- site/**
- _data/**
jobs:
site-build:
name: Build Astro site
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Node.js
uses: actions/setup-node@v6
with:
node-version: 'latest'
cache: npm
cache-dependency-path: site/package-lock.json
- name: Install dependencies
working-directory: site
run: npm ci
- name: Build Astro site
working-directory: site
env:
ASTRO_TELEMETRY_DISABLED: "1"
run: npm run build

View file

@ -1,14 +1,6 @@
name: Promtool check
on:
pull_request:
paths:
- _data/**
push:
branches:
- master
paths:
- _data/**
on: [pull_request, push]
jobs:
promtool-check:
@ -16,21 +8,22 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
uses: actions/checkout@v6
uses: actions/checkout@v4
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
ruby-version: 3.4
ruby-version: 2.7
- name: Set up yq
uses: mikefarah/yq@v4
uses: mikefarah/yq@master
- name: Install liquid
run: gem install liquid-cli
- name: Build rule configuration
run: |
gem install liquid-cli
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
@ -38,7 +31,7 @@ jobs:
mkdir -p "${subdir}"
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml

15
.gitignore vendored
View file

@ -1,13 +1,6 @@
# Generated data
_site/
.sass-cache/
.jekyll-cache/
.jekyll-metadata
_data/rules.json
test/rules/
# Node / Astro
/node_modules
site/node_modules/
site/dist/
site/.astro/
site/public/pagefind/
# Misc
.worktrees/

216
CLAUDE.md
View file

@ -1,216 +0,0 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## Project Overview
A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases (MySQL, PostgreSQL, Redis, MongoDB, Elasticsearch, Cassandra, Clickhouse, CouchDB, etc.), message brokers (RabbitMQ, Kafka, Pulsar, Nats, Zookeeper), proxies/load balancers/service meshes (Nginx, Apache, HaProxy, Traefik, Caddy, Linkerd, Istio), runtimes (PHP-FPM, JVM, Sidekiq), data engineering (Apache Flink, Apache Spark, Hadoop), orchestrators (Kubernetes, Nomad, Consul, Etcd, OpenStack), CI/CD (Jenkins, ArgoCD, FluxCD, GitLab CI, Spinnaker), network and security (SSL/TLS, CoreDNS, Vault, Cloudflare, Cilium, eBPF), storage (Ceph, ZFS, OpenEBS, Minio), cloud providers (AWS, Azure, DigitalOcean), observability (Thanos, Loki, Cortex, OpenTelemetry Collector, Grafana Tempo/Mimir/Alloy, Jaeger), and other (APC UPS, Graph Node).
All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a static site built with Astro + TypeScript (located in `site/`). The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter.
The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually.
## Architecture
- **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format.
- **`site/`** — Astro + TypeScript static site. Run `npm run dev` inside this directory to develop locally.
- **`site/src/data/rules.ts`** — Typed wrappers and helper functions over `_data/rules.yml`.
- **`site/src/data/site.ts`** — Shared site metadata constants (URLs, author, schema objects).
- **`site/src/pages/`** — Astro page routes: `index.astro` (homepage), `rules/[group]/[service].astro` (per-service rule pages), `alertmanager.astro`, `blackbox-exporter.astro`, `sleep-peacefully.astro` (guides).
- **`site/src/layouts/BaseLayout.astro`** — Root HTML layout (SEO, GA, dark mode).
- **`site/src/layouts/GuideLayout.astro`** — Layout for guide pages (TOC, hero, related guides).
- **`site/src/components/`** — Shared Astro components (Header, Footer, Sidebar, RuleCard, ExporterSection, etc.).
- **`site/astro.config.mjs`** — Astro configuration (sitemap, Vite YAML plugin, base URL).
- **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands).
## Rules YAML Structure
Services are listed in README.md.
`_data/rules.yml` hierarchy:
```
groups:
- name: "<category>" # e.g. "Basic resource monitoring"
services:
- name: "<service>" # e.g. "Host and hardware"
exporters:
- name: "<exporter>"
slug: "<slug>" # used for download URLs
doc_url: "<url>" # optional link to exporter docs
comments: # optional, exporter-level multiline notes rendered before rules
"<comment>"
rules:
- name: "<alert name>"
description: "<text>"
query: "<PromQL>"
severity: warning|critical|info
for: "<duration>" # optional, defaults to 0m
comments: # optional, rendered as multiline YAML comments
"<comment>"
```
Services are grouped by category. If you are not sure about the classification, ask the developer.
## Running Locally
```bash
cd site
npm install
npm run dev
```
Site serves at http://localhost:4321/awesome-prometheus-alerts.
To build for production:
```bash
cd site
npm run build
npm run preview
```
## Contributing Rules
All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge.
## Query Validation
- When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names.
- If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions.
- When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it.
- Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`.
- Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors.
- When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`.
- Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+.
- Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s.
- Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`.
- When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker.
## Common Review Pitfalls (learned from PR history)
These are the most frequent issues raised during code review on this repo:
### Severity levels
- `critical` = requires immediate human attention. Do not use for informational/security notifications.
- `warning` = needs attention soon but not urgent.
- `info` = awareness only (e.g., config changes, underutilized resources).
- Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`.
### `for` duration
- Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly.
- Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`.
- Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`.
### Query design
- Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead.
- Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide.
- Don't combine `min_over_time()[1m]` with `for: 2m` redundantly — pick one mechanism for smoothing. Same applies to `avg_over_time()[5m]` with `for: 5m`.
- Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value.
- Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`.
- When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query.
- Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`).
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only.
- Conversely, never use `deriv()` or `delta()` on a metric that is a cumulative counter, even if the exporter declares it as `untyped`. The only reliable way to determine whether a metric is a counter or a gauge is to check whether it monotonically increases and resets on restart — not just the declared type. Known examples of untyped metrics with counter semantics: `node_vmstat_*` (e.g., `node_vmstat_pgmajfault`, `node_vmstat_oom_kill`) from node_exporter (cumulative values from /proc/vmstat — the official node_exporter mixin uses `rate()`); MySQL `SHOW GLOBAL STATUS` variables via mysqld_exporter (e.g., `mysql_global_status_slow_queries`, `mysql_global_status_innodb_log_waits`, `mysql_global_status_questions` — all monotonically increasing, use `rate()`/`increase()`).
- When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window.
- When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`.
- When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful.
- When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`.
- When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts.
- Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`.
- Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent").
### Thresholds
- Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default.
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs.
- Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare).
- Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches.
- For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead.
- For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise.
- When checking a cumulative total metric (one that only resets on process restart) with `> 0`, the alert will fire permanently after the first occurrence and never resolve. Always wrap such metrics in `increase()` or `rate()` to detect new events. Known example: `opensearch_circuitbreaker_tripped_count > 0` fires forever after the first circuit breaker trip.
### Comments
- When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed.
- Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites).
- Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules.
- Never add two `comments:` keys to the same rule or exporter block. YAML silently discards the first when there are duplicate keys in the same mapping. Always merge multiple comment paragraphs into a single `comments:` field using the multiline `|` block scalar.
### Descriptions
- Keep descriptions short, factual, and actionable.
- Include what is happening ("Disk is almost full") and why it matters or what to check.
- Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful.
- If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa).
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number.
### Structure
- Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter.
- Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds.
- The `slug` field must be unique per exporter and is used for download URLs.
## Reference Sources for Cross-Checking Alerts
Use these sources to criticize and validate PromQL queries, compare thresholds, and find inspiration for new rules.
Every time you consume an external resource to change a PromQL query, please compare before/after and explain why you think the external source is right.
### Official project mixins (alerts maintained by the project itself)
- https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts
- https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin
- https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin
- https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin
- https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin
- https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin
- https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs)
- https://github.com/etcd-io/etcd/tree/main/contrib/mixin
- https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/)
- https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/)
- https://github.com/grafana/mimir/tree/main/operations/mimir-mixin
- https://github.com/grafana/tempo/tree/main/operations/tempo-mixin
- https://github.com/grafana/grafana/tree/main/grafana-mixin
- https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also https://github.com/ceph/ceph-mixins)
- https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
- https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md)
- https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin
- https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin
- https://github.com/prometheus-operator/kube-prometheus
- https://github.com/cortexproject/cortex-jsonnet
- https://github.com/gluster/gluster-mixins
### Standalone mixin repositories
- https://github.com/povilasv/coredns-mixin
- https://github.com/adinhodovic/rabbitmq-mixin
- https://github.com/adinhodovic/blackbox-exporter-mixin
- https://github.com/adinhodovic/django-mixin
- https://github.com/adinhodovic/argo-cd-mixin
- https://github.com/adinhodovic/ingress-nginx-mixin
- https://github.com/adinhodovic/kubernetes-autoscaling-mixin
- https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes)
- https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin)
- https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md)
- https://github.com/opensearch-project/opensearch-prometheus-exporter (OpenSearch exporter — check metric names here)
- https://github.com/adinhodovic/postgresql-mixin
- https://github.com/imusmanmalik/cert-manager-mixin
- https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin)
- https://github.com/uneeq-oss/spinnaker-mixin
- https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library)
### Grafana jsonnet-libs (93 mixins — browse for specific services)
- https://github.com/grafana/jsonnet-libs
- Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more.
### Mixin aggregators
- https://monitoring.mixins.dev/ (central registry of all monitoring mixins)
- https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs)
- https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins)
### GitLab monitoring & infrastructure
- https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment)
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules)
- https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus)
### Community alert collections
- https://github.com/jpweber/prometheus-alert-rules
- https://github.com/bdossantos/prometheus-alert-rules
- https://github.com/giantswarm/prometheus-rules
- https://github.com/last9/awesome-prometheus-toolkit
- https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources)

View file

@ -16,16 +16,24 @@ Please ensure your pull request adheres to the following guidelines:
- Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution.
- Queries must be tested on latest exporter version.
## Improving the website
## Improving the GitHub page
The site is built with Astro + TypeScript, located in `site/`.
### Run locally
### Run locally
```
cd site
npm install
npm run dev
gem install bundler
bundle install
jekyll serve
```
Site serves at http://localhost:4321/awesome-prometheus-alerts.
Or with Docker:
```
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```
Or with Docker-Compose:
```
docker-compose up -d
```

3
Gemfile Normal file
View file

@ -0,0 +1,3 @@
source 'https://rubygems.org'
gem 'github-pages', group: :jekyll_plugins
gem 'webrick', '~> 1.3', '>= 1.3.1'

284
Gemfile.lock Normal file
View file

@ -0,0 +1,284 @@
GEM
remote: https://rubygems.org/
specs:
activesupport (6.0.6.1)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2, >= 2.2.2)
addressable (2.8.0)
public_suffix (>= 2.0.2, < 5.0)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
commonmarker (0.23.10)
concurrent-ruby (1.2.0)
dnsruby (1.61.9)
simpleidn (~> 0.1)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
ethon (0.15.0)
ffi (>= 1.15.0)
eventmachine (1.2.7)
execjs (2.8.1)
faraday (1.10.0)
faraday-em_http (~> 1.0)
faraday-em_synchrony (~> 1.0)
faraday-excon (~> 1.1)
faraday-httpclient (~> 1.0)
faraday-multipart (~> 1.0)
faraday-net_http (~> 1.0)
faraday-net_http_persistent (~> 1.0)
faraday-patron (~> 1.0)
faraday-rack (~> 1.0)
faraday-retry (~> 1.0)
ruby2_keywords (>= 0.0.4)
faraday-em_http (1.0.0)
faraday-em_synchrony (1.0.0)
faraday-excon (1.1.0)
faraday-httpclient (1.0.1)
faraday-multipart (1.0.3)
multipart-post (>= 1.2, < 3)
faraday-net_http (1.0.1)
faraday-net_http_persistent (1.2.0)
faraday-patron (1.0.0)
faraday-rack (1.0.0)
faraday-retry (1.0.3)
ffi (1.15.5)
forwardable-extended (2.6.0)
gemoji (3.0.1)
github-pages (226)
github-pages-health-check (= 1.17.9)
jekyll (= 3.9.2)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
jekyll-commonmark-ghpages (= 0.2.0)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.15.1)
jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0)
jekyll-include-cache (= 0.2.1)
jekyll-mentions (= 1.6.0)
jekyll-optional-front-matter (= 0.3.2)
jekyll-paginate (= 1.1.0)
jekyll-readme-index (= 0.3.0)
jekyll-redirect-from (= 0.16.0)
jekyll-relative-links (= 0.6.1)
jekyll-remote-theme (= 0.4.3)
jekyll-sass-converter (= 1.5.2)
jekyll-seo-tag (= 2.8.0)
jekyll-sitemap (= 1.4.0)
jekyll-swiss (= 1.0.0)
jekyll-theme-architect (= 0.2.0)
jekyll-theme-cayman (= 0.2.0)
jekyll-theme-dinky (= 0.2.0)
jekyll-theme-hacker (= 0.2.0)
jekyll-theme-leap-day (= 0.2.0)
jekyll-theme-merlot (= 0.2.0)
jekyll-theme-midnight (= 0.2.0)
jekyll-theme-minimal (= 0.2.0)
jekyll-theme-modernist (= 0.2.0)
jekyll-theme-primer (= 0.6.0)
jekyll-theme-slate (= 0.2.0)
jekyll-theme-tactile (= 0.2.0)
jekyll-theme-time-machine (= 0.2.0)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0)
kramdown (= 2.3.2)
kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.3)
mercenary (~> 0.3)
minima (= 2.5.1)
nokogiri (>= 1.13.4, < 2.0)
rouge (= 3.26.0)
terminal-table (~> 1.4)
github-pages-health-check (1.17.9)
addressable (~> 2.3)
dnsruby (~> 1.60)
octokit (~> 4.0)
public_suffix (>= 3.0, < 5.0)
typhoeus (~> 1.3)
html-pipeline (2.14.1)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.8.0)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
jekyll (3.9.2)
addressable (~> 2.4)
colorator (~> 1.0)
em-websocket (~> 0.5)
i18n (~> 0.7)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3)
liquid (~> 4.0)
mercenary (~> 0.3.3)
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
jekyll-avatar (0.7.0)
jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1)
coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1)
jekyll-commonmark (1.4.0)
commonmarker (~> 0.22)
jekyll-commonmark-ghpages (0.2.0)
commonmarker (~> 0.23.4)
jekyll (~> 3.9.0)
jekyll-commonmark (~> 1.4.0)
rouge (>= 2.0, < 4.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.15.1)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-github-metadata (2.13.0)
jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0)
jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.6.0)
html-pipeline (~> 2.3)
jekyll (>= 3.7, < 5.0)
jekyll-optional-front-matter (0.3.2)
jekyll (>= 3.0, < 5.0)
jekyll-paginate (1.1.0)
jekyll-readme-index (0.3.0)
jekyll (>= 3.0, < 5.0)
jekyll-redirect-from (0.16.0)
jekyll (>= 3.3, < 5.0)
jekyll-relative-links (0.6.1)
jekyll (>= 3.3, < 5.0)
jekyll-remote-theme (0.4.3)
addressable (~> 2.0)
jekyll (>= 3.5, < 5.0)
jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
rubyzip (>= 1.3.0, < 3.0)
jekyll-sass-converter (1.5.2)
sass (~> 3.4)
jekyll-seo-tag (2.8.0)
jekyll (>= 3.8, < 5.0)
jekyll-sitemap (1.4.0)
jekyll (>= 3.7, < 5.0)
jekyll-swiss (1.0.0)
jekyll-theme-architect (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-cayman (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-dinky (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-hacker (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-leap-day (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-merlot (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-midnight (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-minimal (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-modernist (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-primer (0.6.0)
jekyll (> 3.5, < 5.0)
jekyll-github-metadata (~> 2.9)
jekyll-seo-tag (~> 2.0)
jekyll-theme-slate (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-tactile (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-theme-time-machine (0.2.0)
jekyll (> 3.5, < 5.0)
jekyll-seo-tag (~> 2.0)
jekyll-titles-from-headings (0.5.3)
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
jemoji (0.12.0)
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (2.3.2)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
liquid (4.0.3)
listen (3.7.1)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
mercenary (0.3.6)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
minitest (5.17.0)
multipart-post (2.1.1)
nokogiri (1.16.2-x86_64-linux)
racc (~> 1.4)
octokit (4.22.0)
faraday (>= 0.9)
sawyer (~> 0.8.0, >= 0.5.3)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (4.0.7)
racc (1.7.3)
rb-fsevent (0.11.1)
rb-inotify (0.10.1)
ffi (~> 1.0)
rexml (3.2.5)
rouge (3.26.0)
ruby2_keywords (0.0.5)
rubyzip (2.3.2)
safe_yaml (1.0.5)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.8.2)
addressable (>= 2.3.5)
faraday (> 0.8, < 2.0)
simpleidn (0.2.1)
unf (~> 0.1.4)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6)
typhoeus (1.4.0)
ethon (>= 0.9.0)
tzinfo (1.2.11)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.1)
unicode-display_width (1.8.0)
webrick (1.7.0)
zeitwerk (2.6.6)
PLATFORMS
x86_64-linux
x86_64-linux-musl
DEPENDENCIES
github-pages
webrick (~> 1.3, >= 1.3.1)
BUNDLED WITH
2.3.13

38
LICENSE
View file

@ -1,39 +1,3 @@
This repository uses a dual license:
- Alert rules and content (_data/rules.yml, dist/rules/, README.md):
Creative Commons Attribution 4.0 International (CC BY 4.0)
https://creativecommons.org/licenses/by/4.0/
- Site source code (site/):
MIT License
https://opensource.org/licenses/MIT
---
Creative Commons Attribution 4.0 International License (CC BY 4.0)
https://creativecommons.org/licenses/by/4.0/
---
MIT License (site source code)
Copyright (c) 2018 Samuel Berthe
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
http://creativecommons.org/licenses/by/4.0/

118
README.md
View file

@ -1,6 +1,6 @@
# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re)
> **940+ production-ready Prometheus alerting rules for 90+ services** — copy-paste YAML for Kubernetes, MySQL, Redis, Kafka, and more.
> Most alerting rules are common to every Prometheus setup. We need a place to find them all. 🤘 🚨 📊
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
@ -8,18 +8,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
<hr>
<sup><b>Sponsored by:</b></sup>
<br>
<a href="https://cast.ai/samuel">
<div>
<img src="https://samber.github.io/awesome-prometheus-alerts/images/sponsor-cast-ai.png" width="200" alt="Cast AI">
</div>
<div>
Cut Kubernetes & AI costs, boost application stability.
</div>
</a>
<br>
<a href="https://betterstack.com">
<div>
<img src="https://samber.github.io/awesome-prometheus-alerts/images/sponsor-betterstack.png" width="200" alt="Better Stack">
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
</div>
<div>
Better Stack lets you centralize, search, and visualize your logs.
@ -43,130 +34,73 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
- [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware)
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)
#### Databases
#### Databases and brokers
- [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql)
- [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql)
- [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server)
- [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database)
- [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
- [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
- [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [OpenSearch](https://samber.github.io/awesome-prometheus-alerts/rules#opensearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
#### Message brokers
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
#### Proxies, load balancers and service meshes
#### Reverse proxies and load balancers
- [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx)
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
- [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy)
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
#### Runtimes
- [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm)
- [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm)
- [Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang)
- [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby)
- [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python)
- [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq)
#### Data engineering
- [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink)
- [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
#### Orchestrators
- [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes)
- [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad)
- [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul)
- [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd)
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
#### CI/CD
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci)
- [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker)
#### Network and security
- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest)
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
- [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager)
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)
#### Storage
#### Network, security and storage
- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph)
- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs)
- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs)
- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio)
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
#### Cloud providers
- [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch)
- [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver)
- [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean)
- [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure)
#### Observability
#### Other
- [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos)
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo)
- [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)
#### Other
- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups)
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
## 🤝 Contributing
@ -177,15 +111,23 @@ There are many ways to contribute: writing code, alerting rules, documentation,
[Instructions here](CONTRIBUTING.md)
## 🏋️ Improvements
- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...)
- Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)).
## 💫 Show your support
Give a ⭐️ if this project helped you!
[![support us](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/samber)
## 👏 Thanks
Thanks to the GitLab operations team, who contributed 50+ rules. \o/
## 📝 License
- Alert rules and content: [Creative Commons CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
- Site source code: [MIT](site/LICENSE)
[![CC4](https://mirrors.creativecommons.org/presskit/cc.srr.primary.svg)](https://creativecommons.org/licenses/by/4.0/legalcode)
See [LICENSE](LICENSE) for details.
Licensed under the Creative Commons Attribution 4.0 International License (CC BY 4.0). See the LICENSE file for details.

8
_config.yml Normal file
View file

@ -0,0 +1,8 @@
# Jekyll site configuration for the GitHub Pages build.

# Gem-based theme providing the base stylesheet used by the layout.
theme: jekyll-theme-cayman
# Rendered as the project name in the page header (site.title).
title: Awesome Prometheus alerts
# Rendered as the tagline under the project name (site.description).
description: Collection of alerting rules
# owner/name of the GitHub repository backing the site.github.* metadata.
repository: samber/awesome-prometheus-alerts
# Sub-path the site is served from; prepended by the `relative_url` filter.
baseurl: /awesome-prometheus-alerts

File diff suppressed because it is too large Load diff

170
_layouts/default.html Normal file
View file

@ -0,0 +1,170 @@
<!DOCTYPE html>
<html lang="{{ site.lang | default: "en-US" }}">
<head>
<meta charset="UTF-8">
{% seo %}
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#157878">
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
<link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
<link rel="stylesheet" href="{{ '/assets/css/app.css?v=' | append: site.github.build_revision | relative_url }}">
<!-- Favicon: resolved through relative_url so the configured baseurl is honoured
     (a root-absolute /assets/... path points outside the project site). -->
<link rel="icon" type="image/x-icon" href="{{ '/assets/favicon.ico' | relative_url }}">
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.4/clipboard.min.js"></script>
<script src="{{ '/assets/js/app.js?v=' | append: site.github.build_revision | relative_url }}"></script>
<!-- Global site tag (gtag.js) - Google Analytics -->
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-118604063-2"></script>
<script>
window.dataLayer = window.dataLayer || [];
function gtag() {
dataLayer.push(arguments);
}
gtag('js', new Date());
gtag('config', 'UA-118604063-2');
</script>
</head>
<body>
<style>
  /* Skip link: kept off-screen until it receives keyboard focus. */
  #skip-to-content {
    height: 1px;
    width: 1px;
    position: absolute;
    overflow: hidden;
    top: -10px;
  }

  /*
   * Flat selector and literal colours on purpose: this <style> element is
   * emitted verbatim by the layout and is never compiled by Sass, so the
   * previous "&:focus" nesting and "invert($body-link-color)" call were not
   * valid plain CSS and the focus colours were silently dropped.
   * NOTE(review): #e19447 is invert(#1e6bb8), assuming the Cayman theme's
   * default $body-link-color of #1e6bb8 - confirm against the theme.
   */
  #skip-to-content:focus {
    position: fixed;
    top: 10px;
    left: 10px;
    height: auto;
    width: auto;
    background: #e19447;
    outline: thick solid #e19447;
  }

  /* Row of GitHub badges and share icons in the page header. */
  ul.github-buttons-cta li {
    display: inline-block;
    height: 20px;
    padding: 0px 15px;
  }

  ul.github-buttons-cta li a {
    /* width: 100px; */
    text-decoration: none;
  }

  /* Font Awesome share icons. */
  .fa {
    /* padding: 14px;
    width: 50px;
    height: 50px; */
    font-size: 25px;
    text-align: center;
    text-decoration: none;
    border-radius: 50%;
  }

  .fa:hover {
    opacity: 0.8;
  }

  .fa-twitter,
  .fa-linkedin {
    /* background: #55ACEE; */
    color: white;
  }
</style>
<a id="skip-to-content" href="#content">Skip to the content.</a>
<!--
  Site banner: project title and tagline, section navigation, GitHub badges,
  social share links and the sponsor logo.
  - Internal URLs go through relative_url so they keep working under the
    configured baseurl and on nested pages.
  - Share URLs are percent-encoded and use &amp; so the markup stays valid.
  - Links opened in a new tab carry rel="noopener noreferrer".
-->
<header class="page-header" role="banner">
  <h1 class="project-name">
    <a href="{{ '/' | relative_url }}" style="color: white">
      {{ site.title | default: site.github.repository_name }}
    </a>
  </h1>
  <h2 class="project-tagline">{{ site.description | default: site.github.project_tagline }}</h2>

  <a href="{{ '/alertmanager' | relative_url }}" class="btn">Global configuration</a>
  <a href="{{ '/rules' | relative_url }}" class="btn">Rules</a>
  <a href="{{ '/sleep-peacefully' | relative_url }}" class="btn">Sleep peacefully</a>
  <a href="{{ '/blackbox-exporter' | relative_url }}" class="btn">Blackbox</a>
  <a href="https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md" class="btn">
    Contribute on GitHub
  </a>

  <ul class="github-buttons-cta">
    <li>
      <a href="https://github.com/samber/awesome-prometheus-alerts">
        <img alt="GitHub Repo Watchers" src="https://img.shields.io/github/watchers/samber/awesome-prometheus-alerts?style=social">
      </a>
    </li>
    <li>
      <a href="https://github.com/samber/awesome-prometheus-alerts">
        <img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/samber/awesome-prometheus-alerts?style=social">
      </a>
    </li>
    <li>
      <a href="https://github.com/samber/awesome-prometheus-alerts">
        <img alt="GitHub Repo forks" src="https://img.shields.io/github/forks/samber/awesome-prometheus-alerts?style=social">
      </a>
    </li>
    <li>
      <!-- Icon-only link: aria-label provides the accessible name. -->
      <a href="https://twitter.com/share?via=samuelberthe&amp;related=samuelberthe&amp;text=%F0%9F%9A%A8%20%F0%9F%93%8A%20Here%20is%20a%20collection%20of%20Awesome%20Prometheus%20Alerts&amp;url=https%3A%2F%2Fsamber.github.io%2Fawesome-prometheus-alerts"
        class="fa fa-twitter" target="_blank" rel="noopener noreferrer" aria-label="Share on Twitter"></a>
    </li>
    <li>
      <a href="https://www.linkedin.com/shareArticle?mini=true&amp;url=https%3A%2F%2Fsamber.github.io%2Fawesome-prometheus-alerts%2F"
        class="fa fa-linkedin" target="_blank" rel="noopener noreferrer" aria-label="Share on LinkedIn"></a>
    </li>
  </ul>

  <ul id="sponsoring">
    <li>
      Kindly supported by&nbsp; 👉
    </li>
    <li>
      <a href="https://betterstack.com/">
        <img src="{{ '/assets/sponsor-betterstack.png' | relative_url }}" alt="Better Stack" />
      </a>
    </li>
  </ul>
</header>
<main id="content" class="main-content" role="main">
{{ content }}
<footer class="site-footer">
{% if site.github.is_project_page %}
<span class="site-footer-owner">
<a href="{{ site.github.repository_url }}">{{ site.title }}</a> is maintained by
<a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a>.
</span>
{% endif %}
</footer>
</main>
<!-- Screeb tag -->
<script type="text/javascript">
// Third-party loader snippet: registers a global `$screeb` stub and injects
// the remote tag script into <head>.
(function (s,c,r,ee,b) {
// Until the remote script takes over, calls to the stub are buffered in `q`.
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
// Build the <script> element that points at the tag URL passed in as `ee`.
b=c.createElement('script');b.type='text/javascript';
// Marked async before being appended to the document head.
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
}(window,document,'$screeb','https://t2.screeb.app/tag.js'));
// Queue the `init` call with this site's identifier.
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
</script>
<!-- End of Screeb tag -->
</body>
</html>

141
alertmanager.md Normal file
View file

@ -0,0 +1,141 @@
<h1 style="text-align: center;">
Global configuration
</h1>
If you notice a delay between an event and the first notification, read the following blog post => [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
## Prometheus configuration
{% highlight yaml %}
# prometheus.yml
global:
scrape_interval: 20s
# A short evaluation_interval will check alerting rules very often.
# It can be costly if you run Prometheus with 100+ alerts.
evaluation_interval: 20s
...
rule_files:
- 'alerts/*.yml'
scrape_configs:
...
{% endhighlight %}
{% highlight yaml %}
# alerts/example-redis.yml
groups:
- name: ExampleRedisGroup
rules:
- alert: ExampleRedisDown
expr: redis_up{} == 0
for: 2m
labels:
severity: critical
annotations:
summary: "Redis instance down"
description: "Whatever"
{% endhighlight %}
## AlertManager configuration
{% highlight yaml %}
{% raw %}
# alertmanager.yml
route:
# When a new group of alerts is created by an incoming alert, wait at
# least 'group_wait' to send the initial notification.
# This ensures that multiple alerts for the same group that start firing
# shortly after one another are batched together in the first
# notification.
group_wait: 10s
# When the first notification was sent, wait 'group_interval' to send a batch
# of new alerts that started firing for that group.
group_interval: 30s
# If an alert has successfully been sent, wait 'repeat_interval' before
# resending it.
repeat_interval: 30m
# A default receiver
receiver: "slack"
# All the above attributes are inherited by all child routes and can be
# overwritten on each.
routes:
- receiver: "slack"
group_wait: 10s
match_re:
severity: critical|warning
continue: true
- receiver: "pager"
group_wait: 10s
match_re:
severity: critical
continue: true
receivers:
- name: "slack"
slack_configs:
- api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx'
send_resolved: true
channel: 'monitoring'
text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
- name: "pager"
webhook_configs:
- url: http://a.b.c.d:8080/send/sms
send_resolved: true
{% endraw %}
{% endhighlight %}
## Reduce Prometheus server load
For expensive or frequent PromQL queries, Prometheus lets you precompute the result with recording rules.
{% highlight yaml %}
{% raw %}
groups:
# first define the recorded rule
- name: ExampleRecordedGroup
rules:
- record: job:rabbitmq_queue_messages_delivered_total:rate:5m
expr: rate(rabbitmq_queue_messages_delivered_total[5m])
# then use it in alerts
- name: ExampleAlertingGroup
rules:
- alert: ExampleRabbitmqLowMessageDelivery
expr: sum(job:rabbitmq_queue_messages_delivered_total:rate:5m) < 10
for: 2m
labels:
severity: critical
annotations:
summary: "Low delivery rate in Rabbitmq queues"
{% endraw %}
{% endhighlight %}
## Troubleshooting
If the notification takes too much time to be triggered, check the following delays:
- `scrape_interval = 20s` (prometheus.yml)
- `evaluation_interval = 20s` (prometheus.yml)
- `increase(mysql_global_status_slow_queries[1m]) > 0` (alerts/example-mysql.yml)
- `for: 5m` (alerts/example-mysql.yml)
- `group_wait = 10s` (alertmanager.yml)
Also read:
- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)

View file

Before

Width:  |  Height:  |  Size: 8.9 KiB

After

Width:  |  Height:  |  Size: 8.9 KiB

143
assets/css/app.css Normal file
View file

@ -0,0 +1,143 @@
a.anchor {
font-size: 15px;
vertical-align: middle;
color: darkblue;
display: inline-block;
padding-bottom: 5px;
margin-right: 5px;
opacity: 0;
transition: opacity 0.4s;
}
h2:hover a.anchor,
h3:hover a.anchor,
h4:hover a.anchor {
opacity: 1;
}
summary {
position: relative;
padding-left: 60px;
padding-right: 50px;
margin-bottom: 15px;
font-size: 15px;
}
h2 {
position: relative;
}
.clipboard-single,
.clipboard-multiple {
right: 0;
position: absolute;
cursor: pointer;
font-size: 14px;
color: #606c71;
}
/* ---- Rules navigation sidebar ---- */

/* Pinned to the top-right corner; stays hidden until the script reveals it. */
#rules-navbar.affix {
  display: none;
  position: fixed;
  top: 0;
  right: 0;
  max-width: 250px;
  max-height: 100%;
  padding: 20px 10px 20px 20px;
  overflow: auto;
  background-color: #f3f6fa;
  /* Scrolls without showing a scrollbar: IE/Edge, then Firefox. */
  -ms-overflow-style: none;
  scrollbar-width: none;
}

/* Same scrollbar hiding for WebKit-based browsers. */
#rules-navbar.affix::-webkit-scrollbar {
  display: none;
}

/* Narrow viewports never show the sidebar, whatever the script sets. */
@media screen and (max-width: 1350px) {
  #rules-navbar.affix {
    display: none !important;
  }
}
#rules-navbar.affix h3 {
margin-bottom: 10px;
}
#rules-navbar.affix h4 {
margin: 0;
font-weight: bold;
font-size: 14px;
line-height: 14px;
}
#rules-navbar.affix ul,
#rules-navbar.affix ul li {
margin: 0;
padding-top: 0;
padding-bottom: 0;
line-height: normal;
}
#rules-navbar.affix>ul {
padding-left: 0;
padding-right: 0;
}
#rules-navbar.affix>ul>li {
margin-bottom: 10px;
padding-left: 0;
padding-right: 0;
}
#rules-navbar.affix a {
font-size: 14px;
line-height: 14px;
}
/* https://github.com/samber/awesome-prometheus-alerts/issues/356 */
@media screen and (min-width: 64em) {
.main-content {
max-width: 85rem;
}
}
ul#sponsoring {
display: flex;
align-items: center;
justify-content: center;
margin-top: 50px;
}
ul#sponsoring li {
display: flex;
padding: 0px 15px;
font-size: 16px;
}
ul#sponsoring li a {
display: flex;
}
ul#sponsoring li a img {
max-width: 180px;
max-height: 80px;
}
.page-header {
padding-bottom: 30px;
}

BIN
assets/favicon.ico Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.3 KiB

View file

Before

Width:  |  Height:  |  Size: 126 KiB

After

Width:  |  Height:  |  Size: 126 KiB

16
assets/js/app.js Normal file
View file

@ -0,0 +1,16 @@
// Wires the copy-to-clipboard buttons once the DOM is ready.
// Each button names its target through `data-clipboard-target-id`.
$(function () {
  // Ids are escaped before being spliced into a selector: an id containing
  // CSS metacharacters (for example "/" or ".") would otherwise produce an
  // invalid or wrong selector.
  function targetId(trigger) {
    return $.escapeSelector(String(trigger.getAttribute('data-clipboard-target-id')));
  }

  // Single rule: copy the highlighted snippet inside the element with that id.
  var clipboardRules = new ClipboardJS('.clipboard-single', {
    text: function (trigger) {
      var snippets = $('#' + targetId(trigger) + ' .highlight');
      return snippets.text() + '\n';
    },
  });

  // Whole category: copy every snippet under elements whose id starts with
  // the given prefix, separated by a blank line.
  var clipboardCategories = new ClipboardJS('.clipboard-multiple', {
    text: function (trigger) {
      var snippets = $('[id^="' + targetId(trigger) + '"] .highlight');
      return snippets
        .map(function (i, target) {
          return $(target).text();
        })
        .get()
        .join('\n\n');
    },
  });
});

View file

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

View file

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

125
blackbox-exporter.md Normal file
View file

@ -0,0 +1,125 @@
<h1 style="text-align: center;">
Blackbox exporter
</h1>
## Worldwide probes
<a href="https://github.com/prometheus/blackbox_exporter" target="_blank">Blackbox Exporter</a> gives you the ability to probe endpoints over HTTP, HTTPS, DNS, TCP and ICMP.
You should deploy blackbox exporters in multiple Points of Presence around the globe to monitor latency. Feel free to use the following endpoints for your own projects:
- https://screeb-probe-<b>montreal</b>.cleverapps.io
- https://screeb-probe-<b>paris</b>.cleverapps.io
- https://screeb-probe-<b>jeddah</b>.cleverapps.io
- https://screeb-probe-<b>singapore</b>.cleverapps.io
- https://screeb-probe-<b>sydney</b>.cleverapps.io
- https://screeb-probe-<b>warsaw</b>.cleverapps.io
☝️ Logs have been disabled. More probes from the community would be appreciated, please contribute <a href="https://github.com/samber/awesome-prometheus-alerts/" target="_blank">here</a>! These blackbox exporters use the following <a href="https://github.com/ScreebApp/blackbox_exporter/blob/master/screeb.yml" target="_blank">configuration</a>.
## Prometheus Configuration
Blackbox exporters and endpoints must be declared in Prometheus. Here is a simple configuration, inspired by [Hayk Davtyan medium post](https://medium.com/geekculture/single-prometheus-job-for-dozens-of-blackbox-exporters-2a7ba492d6c8):
```yml
# sd/blackbox.yml
- targets:
#
# Montreal
#
# http
- screeb-probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://api.screeb.app
- screeb-probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://t.screeb.app/tag.js
# icmp
- screeb-probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:api.screeb.app
- screeb-probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:t.screeb.app
#
# Paris
#
# http
- screeb-probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://api.screeb.app
- screeb-probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://t.screeb.app/tag.js
# icmp
- screeb-probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:api.screeb.app
- screeb-probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:t.screeb.app
#
# Sydney
#
# http
- screeb-probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://api.screeb.app
- screeb-probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://t.screeb.app/tag.js
# icmp
- screeb-probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:api.screeb.app
- screeb-probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:t.screeb.app
# ...
```
```yml
# prometheus.yml
global:
# ...
scrape_configs:
- job_name: 'blackbox'
metrics_path: /probe
scrape_interval: 30s
scheme: https
file_sd_configs:
- files:
- /etc/prometheus/sd/blackbox.yml
relabel_configs:
# adds "module" label in the final labelset
- source_labels: [__address__]
regex: '.*:_:(.*):_:.*:_:.*:_:.*'
target_label: module
# adds "geohash" label in the final labelset
- source_labels: [__address__]
regex: '.*:_:.*:_:.*:_:(.*):_:.*'
target_label: geohash
# rewrites "instance" label with corresponding URL
- source_labels: [__address__]
regex: '.*:_:.*:_:.*:_:.*:_:(.*)'
target_label: instance
# rewrites "pop" label with corresponding location name
- source_labels: [__address__]
regex: '.*:_:.*:_:(.*):_:.*:_:.*'
target_label: pop
# passes "module" parameter to Blackbox exporter
- source_labels: [module]
target_label: __param_module
# passes "target" parameter to Blackbox exporter
- source_labels: [instance]
target_label: __param_target
# the Blackbox exporter's real hostname:port
- source_labels: [__address__]
regex: '(.*):_:.*:_:.*:_:.*:_:.*'
target_label: __address__
# ...
```
## Geohash
![](assets/grafana-map-panel.png)
To display nice maps in Grafana, you need to instruct blackbox exporters about the location. Grafana map panel speaks the "geohash" format:
- go to Google Maps
- extract the lat/long from the url
- convert lat/long to geohash here: http://geohash.co
## Grafana
Some great dashboards have been created by the community: https://grafana.com/grafana/dashboards/?search=blackbox
Since Grafana v5.0.0, a map panel is available: https://grafana.com/docs/grafana/latest/panels-visualizations/visualizations/geomap/

View file

@ -1,123 +0,0 @@
groups:
- name: FlinkPrometheusReporter
rules:
- alert: FlinkJobIsNotRunning
expr: 'flink_jobmanager_numRunningJobs == 0'
for: 1m
labels:
severity: critical
annotations:
summary: Flink job is not running (instance {{ $labels.instance }})
description: "No Flink jobs are currently running. All jobs may have failed or been cancelled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkNoTaskmanagersRegistered
expr: 'flink_jobmanager_numRegisteredTaskManagers == 0'
for: 1m
labels:
severity: critical
annotations:
summary: Flink no TaskManagers registered (instance {{ $labels.instance }})
description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity.
- alert: FlinkAllTaskSlotsUsed
expr: 'flink_jobmanager_taskSlotsAvailable == 0'
for: 5m
labels:
severity: warning
annotations:
summary: Flink all task slots used (instance {{ $labels.instance }})
description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
- alert: FlinkJobRestartIncreasing
expr: 'delta(flink_jobmanager_job_numRestarts[5m]) > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Flink job restart increasing (instance {{ $labels.instance }})
description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkCheckpointFailures
expr: 'delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Flink checkpoint failures (instance {{ $labels.instance }})
description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Value is converted from milliseconds to seconds for correct humanizeDuration display.
# Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
- alert: FlinkCheckpointDurationHigh
expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60'
for: 5m
labels:
severity: warning
annotations:
summary: Flink checkpoint duration high (instance {{ $labels.instance }})
description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkTaskBackpressured
expr: 'flink_taskmanager_job_task_isBackPressured == 1'
for: 5m
labels:
severity: warning
annotations:
summary: Flink task backpressured (instance {{ $labels.instance }})
description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate.
- alert: FlinkTaskHighBackpressureTime
expr: 'flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500'
for: 5m
labels:
severity: warning
annotations:
summary: Flink task high backpressure time (instance {{ $labels.instance }})
description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration.
- alert: FlinkTaskmanagerHeapMemoryHigh
expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Flink TaskManager heap memory high (instance {{ $labels.instance }})
description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FlinkJobmanagerHeapMemoryHigh
expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Flink JobManager heap memory high (instance {{ $labels.instance }})
description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate().
# Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
- alert: FlinkTaskmanagerGcTimeHigh
expr: 'deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Flink TaskManager GC time high (instance {{ $labels.instance }})
description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Only fires for tasks that have previously received records, to avoid false positives during startup.
- alert: FlinkNoRecordsProcessed
expr: 'delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Flink no records processed (instance {{ $labels.instance }})
description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,89 +0,0 @@
groups:
  - name: SparkPrometheus
    # Spark exposes metrics via two built-in endpoints:
    #   - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040)
    #   - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x)
    # Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging.
    # Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
    rules:
      # The master has reported zero alive workers for 1 minute.
      - alert: SparkNoAliveWorkers
        expr: 'metrics_master_aliveWorkers_Value == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Spark no alive workers (instance {{ $labels.instance }})"
          description: "No Spark workers are alive. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Adjust the threshold based on your cluster's typical queuing behavior.
      - alert: SparkTooManyWaitingApps
        expr: 'metrics_master_waitingApps_Value > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Spark too many waiting apps (instance {{ $labels.instance }})"
          description: "Spark has {{ $value }} applications waiting for resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A worker has reported 0 MB of free memory for 2 minutes.
      - alert: SparkWorkerMemoryExhausted
        expr: 'metrics_worker_memFree_MB_Value == 0'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Spark worker memory exhausted (instance {{ $labels.instance }})"
          description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues.
      - alert: SparkWorkerCoresExhausted
        expr: 'metrics_worker_coresFree_Value == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Spark worker cores exhausted (instance {{ $labels.instance }})"
          description: "Spark worker {{ $labels.instance }} has no free cores.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when more than 10% of executor time is spent in garbage collection.
      # This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
      # Both operands use the *_seconds_total series so numerator and denominator share
      # the same unit; the unsuffixed name `metrics_executor_totalDuration` is not one of
      # the series PrometheusResource publishes, so a ratio built on it would never match.
      # NOTE(review): confirm the series name against your Spark version's endpoint output.
      - alert: SparkExecutorHighGcTime
        expr: 'metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration_seconds_total > 0.1 and metrics_executor_totalDuration_seconds_total > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Spark executor high GC time (instance {{ $labels.instance }})"
          description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The executor has recorded failed tasks while its completed-task count is still zero.
      # $value is the left-hand side of `and`, i.e. the failed-task count.
      - alert: SparkExecutorAllTasksFailing
        expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Spark executor all tasks failing (instance {{ $labels.instance }})"
          description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Ratio of the two cumulative task counters (no rate() window is applied);
      # the `totalTasks_total > 0` clause keeps only executors that have run tasks.
      - alert: SparkExecutorHighTaskFailureRate
        expr: 'metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Spark executor high task failure rate (instance {{ $labels.instance }})"
          description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
      # Disk spilling indicates insufficient memory for the workload.
      - alert: SparkExecutorHighDiskSpill
        expr: 'metrics_executor_diskUsed_bytes > 1e9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Spark executor high disk spill (instance {{ $labels.instance }})"
          description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: LusitaniaeApacheExporter
rules:
- alert: ApacheDown
@ -15,7 +14,7 @@ groups:
description: "Apache down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ApacheWorkersLoad
expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80'
for: 2m
labels:
severity: warning
@ -27,7 +26,7 @@ groups:
expr: 'apache_uptime_seconds_total / 60 < 1'
for: 0m
labels:
severity: info
severity: warning
annotations:
summary: Apache restart (instance {{ $labels.instance }})
description: "Apache has just been restarted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: Apcupsd_exporter
rules:
- alert: ApcUpsBatteryNearlyEmpty
@ -33,7 +32,7 @@ groups:
description: "UPS now running on battery (since {{$value | humanizeDuration}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ApcUpsLowBatteryVoltage
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0'
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
for: 0m
labels:
severity: warning

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: ArgocdServiceNotSynced

View file

@ -1,141 +0,0 @@
groups:
  - name: PrometheusCloudwatchExporter
    # The exporter publishes CloudWatch metrics as gauges named
    # aws_{namespace}_{metric_name}_{statistic}.
    # This group covers exporter health as well as common AWS service alerts.
    # Tune thresholds and label filters to match your CloudWatch exporter configuration.
    rules:
      # The exporter's scrape-error series has been above zero for 5 minutes.
      - alert: CloudwatchExporterScrapeError
        expr: 'cloudwatch_exporter_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CloudWatch exporter scrape error (instance {{ $labels.instance }})"
          description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A single scrape has been taking longer than 300 seconds for 5 minutes.
      - alert: CloudwatchExporterSlowScrape
        expr: 'cloudwatch_exporter_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CloudWatch exporter slow scrape (instance {{ $labels.instance }})"
          description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # CloudWatch API calls are billed (~$0.01 per 1000 GetMetricData requests).
      # 100 requests/minute ≈ $45/month. Pick a threshold that fits your budget.
      # The per-second rate is multiplied by 60 to express calls per minute.
      - alert: CloudwatchApiHighRequestRate
        expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "CloudWatch API high request rate (instance {{ $labels.instance }})"
          description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the EC2 CPUUtilization metric to be configured in the CloudWatch exporter.
      - alert: AwsEc2HighCpuUtilization
        expr: 'aws_ec2_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "AWS EC2 high CPU utilization (instance {{ $labels.instance }})"
          description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the RDS FreeStorageSpace metric. 2GB is only a rough default;
      # size the threshold to your database.
      - alert: AwsRdsLowFreeStorageSpace
        expr: 'aws_rds_free_storage_space_average < 2000000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AWS RDS low free storage space (instance {{ $labels.instance }})"
          description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the RDS CPUUtilization metric to be configured in the CloudWatch exporter.
      - alert: AwsRdsHighCpuUtilization
        expr: 'aws_rds_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "AWS RDS high CPU utilization (instance {{ $labels.instance }})"
          description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The right threshold depends on the RDS instance class; derive it from
      # your instance type's max_connections parameter.
      - alert: AwsRdsHighDatabaseConnections
        expr: 'aws_rds_database_connections_average > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AWS RDS high database connections (instance {{ $labels.instance }})"
          description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the SQS ApproximateNumberOfMessagesVisible metric. 1000 is only a
      # rough default; set it from your expected queue depth.
      - alert: AwsSqsQueueMessagesVisible
        expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "AWS SQS queue messages visible (instance {{ $labels.instance }})"
          description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the SQS ApproximateAgeOfOldestMessage metric. 3600 seconds = 1 hour.
      - alert: AwsSqsMessageAgeTooOld
        expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "AWS SQS message age too old (instance {{ $labels.instance }})"
          description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB UnHealthyHostCount metric.
      - alert: AwsAlbUnhealthyTargets
        expr: 'aws_applicationelb_unhealthy_host_count_average > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "AWS ALB unhealthy targets (instance {{ $labels.instance }})"
          description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
      # The `request_count_sum > 0` clause keeps only load balancers that received requests.
      - alert: AwsAlbHigh5xxErrorRate
        expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "AWS ALB high 5xx error rate (instance {{ $labels.instance }})"
          description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB TargetResponseTime metric.
      - alert: AwsAlbHighTargetResponseTime
        expr: 'aws_applicationelb_target_response_time_average > 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AWS ALB high target response time (instance {{ $labels.instance }})"
          description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the Lambda Errors and Invocations metrics.
      # The `invocations_sum > 0` clause keeps only functions that were invoked.
      - alert: AwsLambdaHighErrorRate
        expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "AWS Lambda high error rate (instance {{ $labels.instance }})"
          description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,57 +0,0 @@
groups:
  - name: AzureMetricsExporter
    # Forwarded Azure Monitor metrics use azurerm_resource_metric as the default metric name;
    # the name parameter in the probe configuration can override it.
    # The exporter's self-monitoring series carry the azurerm_stats_* and azurerm_api_* prefixes.
    rules:
      # More than 5 additional error-result requests were counted over the last 15 minutes.
      - alert: AzureExporterRequestErrors
        expr: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Azure exporter request errors (instance {{ $labels.instance }})"
          description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-instance share of error-result requests among all requests, as a percentage.
      # The final clause keeps only instances whose total request rate is positive.
      - alert: AzureExporterHighErrorRate
        expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Azure exporter high error rate (instance {{ $labels.instance }})"
          description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Azure Resource Manager applies rate limits per subscription.
      # 100 remaining calls is only a rough default; choose a value that fits your
      # scrape interval and the number of monitored resources.
      - alert: AzureApiReadRateLimitApproaching
        expr: 'azurerm_api_ratelimit{type="read"} < 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Azure API read rate limit approaching (instance {{ $labels.instance }})"
          description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same check for the write quota, with a threshold of 50 remaining calls.
      - alert: AzureApiWriteRateLimitApproaching
        expr: 'azurerm_api_ratelimit{type="write"} < 50'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Azure API write rate limit approaching (instance {{ $labels.instance }})"
          description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Metric collection has been taking longer than 300 seconds for 5 minutes.
      - alert: AzureExporterSlowCollection
        expr: 'azurerm_stats_metric_collecttime > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Azure exporter slow collection (instance {{ $labels.instance }})"
          description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,12 +2,11 @@ groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
expr: 'probe_success == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -24,7 +23,7 @@ groups:
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe
expr: 'probe_duration_seconds > 1'
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
@ -34,7 +33,7 @@ groups:
- alert: BlackboxProbeHttpFailure
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -50,19 +49,15 @@ groups:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSslCertificateWillExpireVerySoon
- alert: BlackboxSslCertificateWillExpireSoon
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Blackbox SSL certificate will expire very soon (instance {{ $labels.instance }})
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
# need to enable insecure_skip_verify. Note that this will disable
# certificate validation.
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m
@ -73,7 +68,7 @@ groups:
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp
expr: 'probe_http_duration_seconds > 1'
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning
@ -82,7 +77,7 @@ groups:
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing
expr: 'probe_icmp_duration_seconds > 1'
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
for: 1m
labels:
severity: warning

View file

@ -1,33 +0,0 @@
groups:
  - name: EmbeddedExporter
    rules:
      # Fires immediately (for: 0m) when an upstream's healthy gauge reads 0.
      - alert: CaddyReverseProxyDown
        expr: 'caddy_reverse_proxy_upstreams_healthy == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Caddy Reverse Proxy Down (instance {{ $labels.instance }})"
          description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Per-instance share of requests with a 4xx status code over a 3m rate window, as a percentage.
      # The final clause keeps only instances whose total request rate is positive.
      - alert: CaddyHighHttp4xxErrorRateService
        expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})"
          description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same computation restricted to 5xx status codes.
      - alert: CaddyHighHttp5xxErrorRateService
        expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})"
          description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: CriteoCassandraExporter
rules:
- alert: CassandraHintsCount
@ -15,7 +14,7 @@ groups:
description: "Cassandra hints count has changed on {{ $labels.instance }} some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionTaskPending
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
for: 2m
labels:
severity: warning
@ -24,7 +23,7 @@ groups:
description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraViewwriteLatency
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
for: 2m
labels:
severity: warning
@ -32,50 +31,49 @@ groups:
summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraAuthenticationFailures
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
- alert: CassandraBadHacker
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra authentication failures (instance {{ $labels.instance }})
summary: Cassandra bad hacker (instance {{ $labels.instance }})
description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: CassandraNodeDown
expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra node down (instance {{ $labels.instance }})
description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCommitlogPendingTasks(criteo)
- alert: CassandraCommitlogPendingTasks
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }})
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionExecutorBlockedTasks(criteo)
- alert: CassandraCompactionExecutorBlockedTasks
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }})
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraFlushWriterBlockedTasks(criteo)
- alert: CassandraFlushWriterBlockedTasks
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }})
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraRepairPendingTasks
@ -96,75 +94,74 @@ groups:
summary: Cassandra repair blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal(criteo)
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
- alert: CassandraConnectionTimeoutsTotal
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }})
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
description: "Some connection between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraStorageExceptions(criteo)
- alert: CassandraStorageExceptions
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }})
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraTombstoneDump(criteo)
- alert: CassandraTombstoneDump
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }})
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
description: "Too much tombstones scanned in queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableWrite(criteo)
- alert: CassandraClientRequestUnavailableWrite
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }})
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
description: "Write failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableRead(criteo)
- alert: CassandraClientRequestUnavailableRead
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }})
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05'
- alert: CassandraClientRequestWriteFailure
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }})
summary: Cassandra client request write failure (instance {{ $labels.instance }})
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure(criteo)
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05'
- alert: CassandraClientRequestReadFailure
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
summary: Cassandra client request read failure (instance {{ $labels.instance }})
description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find to the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns.
- alert: CassandraCacheHitRateKeyCache
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
for: 2m
labels:
severity: warning
severity: critical
annotations:
summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }})
description: "Key cache hit rate is below 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,13 +2,11 @@ groups:
- name: InstaclustrCassandraExporter
rules:
# 1m delay allows a restart without triggering an alert.
- alert: CassandraNodeIsUnavailable
expr: 'cassandra_endpoint_active < 1'
for: 1m
expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
for: 0m
labels:
severity: critical
annotations:
@ -24,92 +22,92 @@ groups:
summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }})
description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCommitlogPendingTasks(instaclustr)
- alert: CassandraCommitlogPendingTasks
expr: 'cassandra_commit_log_pending_tasks > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraCompactionExecutorBlockedTasks(instaclustr)
- alert: CassandraCompactionExecutorBlockedTasks
expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraFlushWriterBlockedTasks(instaclustr)
- alert: CassandraFlushWriterBlockedTasks
expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
for: 2m
labels:
severity: warning
annotations:
summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraConnectionTimeoutsTotal(instaclustr)
expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
- alert: CassandraConnectionTimeoutsTotal
expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraStorageExceptions(instaclustr)
- alert: CassandraStorageExceptions
expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraTombstoneDump(instaclustr)
- alert: CassandraTombstoneDump
expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableWrite(instaclustr)
- alert: CassandraClientRequestUnavailableWrite
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestUnavailableRead(instaclustr)
- alert: CassandraClientRequestUnavailableRead
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestWriteFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5'
- alert: CassandraClientRequestWriteFailure
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }})
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure(instaclustr)
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }})
summary: Cassandra client request write failure (instance {{ $labels.instance }})
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CassandraClientRequestReadFailure
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Cassandra client request read failure (instance {{ $labels.instance }})
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,14 +2,11 @@ groups:
- name: EmbeddedExporter
rules:
# ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
# This rule fires on any non-OK state. Split into ==1 (warning) and ==2 (critical) if you want separate severity levels.
- alert: CephState
expr: 'ceph_health_status != 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -36,16 +33,15 @@ groups:
- alert: CephOsdDown
expr: 'ceph_osd_up == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: Ceph OSD Down (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance.
- alert: CephHighOsdLatency
expr: 'ceph_osd_apply_latency_ms > 5000'
expr: 'ceph_osd_perf_apply_latency_seconds > 5'
for: 1m
labels:
severity: warning
@ -53,16 +49,14 @@ groups:
summary: Ceph high OSD latency (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
# ceph_health_detail exposes named health checks as individual time series.
- alert: CephOsdNearFull
expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1'
for: 5m
- alert: CephOsdLowSpace
expr: 'ceph_osd_utilization > 90'
for: 2m
labels:
severity: warning
annotations:
summary: Ceph OSD near full (instance {{ $labels.instance }})
description: "A Ceph OSD is dangerously full. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Ceph OSD low space (instance {{ $labels.instance }})
description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CephOsdReweighted
expr: 'ceph_osd_weight < 1'
@ -120,7 +114,7 @@ groups:
- alert: CephPgUnavailable
expr: 'ceph_pg_total - ceph_pg_active > 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:

View file

@ -1,45 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
# Fires when no `up` series with job="cert-manager" exists at all for 10 minutes (the scrape target is gone, not merely failing).
# NOTE(review): absent() only carries over labels from equality matchers, so `instance` in the summary is presumably empty here - confirm.
- alert: Cert-managerAbsent
expr: 'absent(up{job="cert-manager"})'
for: 10m
labels:
severity: critical
annotations:
summary: Cert-Manager absent (instance {{ $labels.instance }})
description: "Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration.
- alert: Cert-managerCertificateExpiringSoon
expr: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)'
for: 1h
labels:
severity: warning
annotations:
summary: Cert-Manager certificate expiring soon (instance {{ $labels.instance }})
description: "The certificate {{ $labels.name }} is expiring in less than 21 days.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: Cert-managerCertificateNotReady
expr: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)'
for: 10m
labels:
severity: critical
annotations:
summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count.
# For cert-manager < v1.19, use: certmanager_http_acme_client_request_count.
- alert: Cert-managerHittingAcmeRateLimits
expr: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cert-Manager hitting ACME rate limits (instance {{ $labels.instance }})
description: "Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,294 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
# Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
- alert: CiliumAgentUnreachableNodes
expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
for: 15m
labels:
severity: warning
annotations:
summary: Cilium agent unreachable nodes (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
- alert: CiliumAgentUnreachableHealthEndpoints
expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
for: 15m
labels:
severity: warning
annotations:
summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
- alert: CiliumAgentFailingControllers
expr: 'sum(cilium_controllers_failing{}) by (pod) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent failing controllers (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointFailures
expr: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent endpoint failures (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointRegenerationFailures
expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent endpoint regeneration failures (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointUpdateFailure
expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent endpoint update failure (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentEndpointCreateFailure
expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05'
for: 5m
labels:
severity: info
annotations:
summary: Cilium agent endpoint create failure (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentMapOperationFailures
expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent map operation failures (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped.
- alert: CiliumAgentBpfMapPressure
expr: 'cilium_bpf_map_pressure{} > 0.9'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent BPF map pressure (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentConntrackTableFull
expr: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium agent conntrack table full (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentConntrackFailedGarbageCollection
expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent conntrack failed garbage collection (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentNatTableFull
expr: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium agent NAT table full (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked.
- alert: CiliumAgentHighDeniedRate
expr: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0'
for: 10m
labels:
severity: info
annotations:
summary: Cilium agent high denied rate (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentHighDropRate
expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent high drop rate (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the fullest policy map of an agent pod stays above 90% for 5 minutes.
# Map pressure is a per-map ratio, so the per-pod aggregation must be max(): summing the ratios of
# several policy maps would exceed 0.9 even when every individual map is far from full.
- alert: CiliumAgentPolicyMapPressure
expr: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent policy map pressure (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentPolicyImportErrors
expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent policy import errors (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
- alert: CiliumAgentPolicyImplementationDelay
expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent policy implementation delay (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumNode-localHighIdentityAllocation
expr: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium node-local high identity allocation (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the summed cluster_local identity count exceeds 80% of (2^16-256) for 5 minutes.
# `by ()` collapses every reporting series into a single result, so the alert carries no `instance` or `pod` label.
# NOTE(review): if each agent reports the identities it knows about, the sum presumably counts shared identities once per agent - confirm.
# NOTE(review): 2^16-256 looks like the usable cluster-wide identity range - confirm against your Cilium version.
- alert: CiliumClusterHighIdentityAllocation
expr: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium cluster high identity allocation (instance {{ $labels.instance }})
description: "Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumOperatorExhaustedIpamIps
expr: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium operator exhausted IPAM IPs (instance {{ $labels.instance }})
description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size.
- alert: CiliumOperatorLowAvailableIpamIps
expr: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }})
description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
- alert: CiliumOperatorIpamInterfaceCreationFailures
expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05'
for: 10m
labels:
severity: warning
annotations:
summary: Cilium operator IPAM interface creation failures (instance {{ $labels.instance }})
description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentApiErrors
expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium agent API errors (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumAgentKubernetesClientErrors
expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05'
for: 5m
labels:
severity: info
annotations:
summary: Cilium agent Kubernetes client errors (instance {{ $labels.instance }})
description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumClustermeshRemoteClusterNotReady
expr: 'count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium ClusterMesh remote cluster not ready (instance {{ $labels.instance }})
description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumClustermeshRemoteClusterFailing
expr: 'sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }})
description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshRemoteClusterNotReady
expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium KVStoreMesh remote cluster not ready (instance {{ $labels.instance }})
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshRemoteClusterFailing
expr: 'sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }})
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumKvstoremeshSyncErrors
expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05'
for: 5m
labels:
severity: critical
annotations:
summary: Cilium KVStoreMesh sync errors (instance {{ $labels.instance }})
description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CiliumHubbleLostEvents
expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium Hubble lost events (instance {{ $labels.instance }})
description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload.
- alert: CiliumHubbleHighDnsErrorRate
expr: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Cilium Hubble high DNS error rate (instance {{ $labels.instance }})
description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,181 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
# Adjust the job label to match your Prometheus configuration.
- alert: ClickhouseNodeDown
expr: 'up{job="clickhouse"} == 0'
for: 2m
labels:
severity: critical
annotations:
summary: ClickHouse node down (instance {{ $labels.instance }})
description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseMemoryUsageCritical
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
for: 5m
labels:
severity: critical
annotations:
summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseMemoryUsageWarning
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
for: 5m
labels:
severity: warning
annotations:
summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDiskSpaceLowOnDefault
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDiskSpaceCriticalOnDefault
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDiskSpaceLowOnBackups
expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as either replica error counter grew during the last 5 minutes.
# ClickHouseErrorMetric_* series are handled as cumulative counters elsewhere in this file (via increase()),
# so comparing the raw value with == 1 would only match while exactly one error had ever been recorded.
- alert: ClickhouseReplicaErrors
expr: 'increase(ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE[5m]) > 0 or increase(ClickHouseErrorMetric_ALL_REPLICAS_LOST[5m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as the NO_AVAILABLE_REPLICA error counter grew during the last 5 minutes.
# ClickHouseErrorMetric_* series are handled as cumulative counters elsewhere in this file (via increase()),
# so comparing the raw value with == 1 would only match while exactly one error had ever been recorded.
- alert: ClickhouseNoAvailableReplicas
expr: 'increase(ClickHouseErrorMetric_NO_AVAILABLE_REPLICA[5m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as the TOO_FEW_LIVE_REPLICAS error counter grew during the last 5 minutes.
# ClickHouseErrorMetric_* series are handled as cumulative counters elsewhere in this file (via increase()),
# so comparing the raw value with == 1 would only match while exactly one error had ever been recorded.
- alert: ClickhouseNoLiveReplicas
expr: 'increase(ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS[5m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighTcpConnections
expr: 'ClickHouseMetrics_TCPConnection > 400'
for: 5m
labels:
severity: warning
annotations:
summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Adjust the threshold based on your cluster size and expected replication traffic.
- alert: ClickhouseInterserverConnectionIssues
expr: 'ClickHouseMetrics_InterserverConnection > 50'
for: 5m
labels:
severity: warning
annotations:
summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the ZooKeeperSession metric is anything other than 1 for 3 minutes.
# NOTE(review): `!= 1` also matches 0, so servers that do not use ZooKeeper/Keeper at all would presumably alert - confirm or scope with a label matcher.
- alert: ClickhouseZookeeperConnectionIssues
expr: 'ClickHouseMetrics_ZooKeeperSession != 1'
for: 3m
labels:
severity: warning
annotations:
summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseAuthenticationFailures
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3'
for: 0m
labels:
severity: info
annotations:
summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseAccessDeniedErrors
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3'
for: 0m
labels:
severity: info
annotations:
summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseRejectedInsertQueries
expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2'
for: 1m
labels:
severity: warning
annotations:
summary: ClickHouse rejected insert queries (instance {{ $labels.instance }})
description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDelayedInsertQueries
expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: ClickHouse delayed insert queries (instance {{ $labels.instance }})
description: "INSERTs delayed due to high number of active parts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseZookeeperHardwareException
expr: 'increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkUsage
expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
for: 2m
labels:
severity: warning
annotations:
summary: ClickHouse high network usage (instance {{ $labels.instance }})
description: "High network usage. ClickHouse network usage exceeds 100MB/s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ClickhouseDistributedRejectedInserts
expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3'
for: 2m
labels:
severity: critical
annotations:
summary: ClickHouse distributed rejected inserts (instance {{ $labels.instance }})
description: "INSERTs into Distributed tables rejected due to pending bytes limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,11 +2,10 @@ groups:
- name: LablabsCloudflareExporter
rules:
- alert: CloudflareHttp4xxErrorRate
expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0'
expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5'
for: 0m
labels:
severity: warning
@ -15,7 +14,7 @@ groups:
description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CloudflareHttp5xxErrorRate
expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0'
expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
for: 0m
labels:
severity: critical

View file

@ -2,7 +2,6 @@ groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CorednsPanicCount

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CortexRulerConfigurationReloadFailure
@ -23,25 +22,23 @@ groups:
summary: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: CortexNotificationsAreBeingDropped
expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05'
- alert: CortexNotificationAreBeingDropped
expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cortex notifications are being dropped (instance {{ $labels.instance }})
description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Cortex notification are being dropped (instance {{ $labels.instance }})
description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: CortexNotificationErrors
expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05'
- alert: CortexNotificationError
expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Cortex notification errors (instance {{ $labels.instance }})
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Cortex notification error (instance {{ $labels.instance }})
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: CortexIngesterUnhealthy
expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'

View file

@ -1,170 +0,0 @@
groups:
- name: GesellixCouchdbPrometheusExporter
rules:
# Fires when either couchdb_httpd_node_up or couchdb_httpd_up has reported 0 for 2 minutes.
- alert: CouchdbNodeDown
expr: 'couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0'
for: 2m
labels:
severity: critical
annotations:
summary: CouchDB node down (instance {{ $labels.instance }})
description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_erlang_memory_atom_used stays above 90% of couchdb_erlang_memory_atom for 5 minutes.
- alert: CouchdbAtomMemoryUsageCritical
expr: 'couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom'
for: 5m
labels:
severity: critical
annotations:
summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The default max_dbs_open is 500, but the threshold below assumes a limit of 1000 (0.9 * 1000 = 900 open databases).
# With the default setting this rule is unlikely to ever fire: replace 1000 with your actual max_dbs_open value.
- alert: CouchdbOpenDatabasesCritical
expr: 'couchdb_httpd_open_databases > 0.9 * 1000'
for: 5m
labels:
severity: critical
annotations:
summary: CouchDB open databases critical (instance {{ $labels.instance }})
description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_httpd_open_os_files stays above 90% of a hard-coded limit of 65535 for 5 minutes.
# Adjust 65535 to match your system's file descriptor limit (ulimit -n).
- alert: CouchdbOpenOsFilesCritical
expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
for: 5m
labels:
severity: critical
annotations:
summary: CouchDB open OS files critical (instance {{ $labels.instance }})
description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when more than 5% of HTTP requests return a 5xx status for 5 minutes.
# Both sides are summed per instance before dividing: the 5xx series carry a `code` label, so dividing
# them directly by couchdb_httpd_requests only works if that metric has an identical label set.
# The trailing guard skips instances with no traffic (avoids dividing by zero).
- alert: Couchdb5xxErrorRatioHigh
expr: 'sum by (instance) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / sum by (instance) (rate(couchdb_httpd_requests[5m])) > 0.05 and sum by (instance) (rate(couchdb_httpd_requests[5m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: CouchDB 5xx error ratio high (instance {{ $labels.instance }})
description: "More than 5% of HTTP requests are returning 5xx errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the 5-minute rate of couchdb_httpd_temporary_view_reads stays above 100/s for 5 minutes.
- alert: CouchdbTemporaryViewReadRateCritical
expr: 'rate(couchdb_httpd_temporary_view_reads[5m]) > 100'
for: 5m
labels:
severity: critical
annotations:
summary: CouchDB temporary view read rate critical (instance {{ $labels.instance }})
description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the 5-minute rate of couchdb_mango_too_many_docs_scanned stays above 50/s for 5 minutes.
- alert: CouchdbMangoQueriesScanningTooManyDocs
expr: 'rate(couchdb_mango_too_many_docs_scanned[5m]) > 50'
for: 5m
labels:
severity: warning
annotations:
summary: CouchDB Mango queries scanning too many docs (instance {{ $labels.instance }})
description: "Some Mango queries are scanning too many documents, consider adding indexes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the 5-minute rate of couchdb_mango_query_invalid_index stays above 5/s for 5 minutes.
- alert: CouchdbMangoQueriesFailedDueToInvalidIndex
expr: 'rate(couchdb_mango_query_invalid_index[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: CouchDB Mango queries failed due to invalid index (instance {{ $labels.instance }})
description: "Some Mango queries failed to execute because the index was missing or invalid\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the 5-minute rate of couchdb_mango_docs_examined stays above 1000/s for 5 minutes.
# This is an overall per-second rate, not a per-query average, and the description says so.
- alert: CouchdbMangoDocsExaminedHigh
expr: 'rate(couchdb_mango_docs_examined[5m]) > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: CouchDB Mango docs examined high (instance {{ $labels.instance }})
description: "Mango queries are examining more than 1000 documents per second, consider adding indexes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_changes_manager_deaths has increased over the last 5 minutes, held for 1 minute.
- alert: CouchdbReplicatorManagerDied
expr: 'increase(couchdb_replicator_changes_manager_deaths[5m]) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: CouchDB Replicator manager died (instance {{ $labels.instance }})
description: "Replication manager process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_changes_queue_deaths has increased over the last 5 minutes, held for 1 minute.
- alert: CouchdbReplicatorQueueProcessDied
expr: 'increase(couchdb_replicator_changes_queue_deaths[5m]) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: CouchDB Replicator queue process died (instance {{ $labels.instance }})
description: "Replication queue process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_changes_reader_deaths has increased over the last 5 minutes, held for 1 minute.
- alert: CouchdbReplicatorReaderProcessDied
expr: 'increase(couchdb_replicator_changes_reader_deaths[5m]) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: CouchDB Replicator reader process died (instance {{ $labels.instance }})
description: "Replication reader process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_failed_starts has increased over the last 5 minutes, held for 1 minute.
- alert: CouchdbReplicatorFailedToStart
expr: 'increase(couchdb_replicator_failed_starts[5m]) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: CouchDB Replicator failed to start (instance {{ $labels.instance }})
description: "One or more replication tasks failed to start\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_cluster_is_stable has been 0 for 2 minutes.
- alert: CouchdbReplicationClusterUnstable
expr: 'couchdb_replicator_cluster_is_stable == 0'
for: 2m
labels:
severity: critical
annotations:
summary: CouchDB replication cluster unstable (instance {{ $labels.instance }})
description: "The replication cluster is unstable, replication may be interrupted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when couchdb_replicator_changes_read_failures grows by more than 5 within a 5-minute window,
# and that condition holds for 5 minutes.
- alert: CouchdbReplicationReadFailures
expr: 'increase(couchdb_replicator_changes_read_failures[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: CouchDB replication read failures (instance {{ $labels.instance }})
description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when process_open_fds exceeds 85% of process_max_fds for 5 minutes; the guard skips series whose max is 0.
# NOTE(review): the expression has no job/instance selector, so it matches every target that exposes
# process_* metrics, not only CouchDB. Confirm whether a selector should be added for your setup.
- alert: CouchdbFileDescriptorsHigh
expr: 'process_open_fds / process_max_fds > 0.85 and process_max_fds > 0'
for: 5m
labels:
severity: warning
annotations:
summary: CouchDB file descriptors high (instance {{ $labels.instance }})
description: "Process is using more than 85% of allowed file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when process_start_time_seconds changed at least once during the last hour, held for 1 minute.
# NOTE(review): the expression has no job/instance selector, so it matches every target that exposes
# process_start_time_seconds, not only CouchDB. Confirm whether a selector should be added for your setup.
- alert: CouchdbProcessRestarted
expr: 'changes(process_start_time_seconds[1h]) > 0'
for: 1m
labels:
severity: info
annotations:
summary: CouchDB process restarted (instance {{ $labels.instance }})
description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when more than 5 log entries with level "error" or "critical" were counted in the last 5 minutes, held for 1 minute.
- alert: CouchdbCriticalLogEntries
expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: CouchDB critical log entries (instance {{ $labels.instance }})
description: "Critical or error log entries detected in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,97 +0,0 @@
groups:
- name: DigitaloceanExporter
rules:
# Fires when digitalocean_droplet_up has been 0 for 5 minutes. The description reads the name, id and region labels.
- alert: DigitaloceanDropletDown
expr: 'digitalocean_droplet_up == 0'
for: 5m
labels:
severity: critical
annotations:
summary: DigitalOcean droplet down (instance {{ $labels.instance }})
description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_account_active has been anything other than 1 for 5 minutes.
- alert: DigitaloceanAccountNotActive
expr: 'digitalocean_account_active != 1'
for: 5m
labels:
severity: critical
annotations:
summary: DigitalOcean account not active (instance {{ $labels.instance }})
description: "DigitalOcean account is not active. It may be suspended or locked.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_database_status has been 0 for 2 minutes. The description reads the name, engine and region labels.
- alert: DigitaloceanDatabaseDown
expr: 'digitalocean_database_status == 0'
for: 2m
labels:
severity: critical
annotations:
summary: DigitalOcean database down (instance {{ $labels.instance }})
description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_kubernetes_cluster_up has been 0 for 5 minutes. The description reads the name, version and region labels.
- alert: DigitaloceanKubernetesClusterDown
expr: 'digitalocean_kubernetes_cluster_up == 0'
for: 5m
labels:
severity: critical
annotations:
summary: DigitalOcean Kubernetes cluster down (instance {{ $labels.instance }})
description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_loadbalancer_status has been 0 for 2 minutes. The description reads the name and ip labels.
- alert: DigitaloceanLoadBalancerDown
expr: 'digitalocean_loadbalancer_status == 0'
for: 2m
labels:
severity: critical
annotations:
summary: DigitalOcean load balancer down (instance {{ $labels.instance }})
description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_loadbalancer_droplets has been 0 for 1 minute, i.e. the load balancer reports no attached droplets.
- alert: DigitaloceanLoadBalancerNoBackends
expr: 'digitalocean_loadbalancer_droplets == 0'
for: 1m
labels:
severity: warning
annotations:
summary: DigitalOcean load balancer no backends (instance {{ $labels.instance }})
description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as digitalocean_floating_ipv4_active is 0 (no pending period). The description reads the ipv4 and region labels.
- alert: DigitaloceanFloatingIpNotAssigned
expr: 'digitalocean_floating_ipv4_active == 0'
for: 0m
labels:
severity: warning
annotations:
summary: DigitalOcean floating IP not assigned (instance {{ $labels.instance }})
description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as digitalocean_incidents_total is above 0 (no pending period). The value is reported as the incident count.
- alert: DigitaloceanActiveIncidents
expr: 'digitalocean_incidents_total > 0'
for: 0m
labels:
severity: warning
annotations:
summary: DigitalOcean active incidents (instance {{ $labels.instance }})
description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when digitalocean_errors_total grows by more than 3 within a 5-minute window, and that holds for 5 minutes.
# The description reads the collector label to name the failing collector.
- alert: DigitaloceanExporterCollectionErrors
expr: 'increase(digitalocean_errors_total[5m]) > 3'
for: 5m
labels:
severity: warning
annotations:
summary: DigitalOcean exporter collection errors (instance {{ $labels.instance }})
description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when more than 80% of the account's droplet limit is in use.
# Droplets are counted per exporter instance and matched to the limit on `instance`: a bare count()
# produces a series with no labels, which cannot be matched one-to-one against a labelled limit series.
# The trailing guard skips accounts reporting a limit of 0 (avoids dividing by zero).
- alert: DigitaloceanDropletLimitApproaching
expr: '(count by (instance) (digitalocean_droplet_up) / on (instance) digitalocean_account_droplet_limit) * 100 > 80 and on (instance) digitalocean_account_droplet_limit > 0'
for: 0m
labels:
severity: warning
annotations:
summary: DigitalOcean droplet limit approaching (instance {{ $labels.instance }})
description: "DigitalOcean account is using {{ $value }}% of its droplet quota.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,10 +2,8 @@ groups:
- name: GoogleCadvisor
rules:
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerKilled
expr: 'time() - container_last_seen > 60'
for: 0m
@ -15,7 +13,6 @@ groups:
summary: Container killed (instance {{ $labels.instance }})
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerAbsent
expr: 'absent(container_last_seen)'
for: 5m
@ -25,17 +22,15 @@ groups:
summary: Container absent (instance {{ $labels.instance }})
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
- alert: ContainerHighCpuUtilization
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: Container High CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
for: 2m
@ -46,7 +41,7 @@ groups:
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerVolumeUsage
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total) BY (instance) > 0'
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
for: 2m
labels:
severity: warning
@ -55,31 +50,22 @@ groups:
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate
expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
for: 5m
labels:
severity: warning
annotations:
summary: Container high throttle rate (instance {{ $labels.instance }})
description: "Container is being throttled ({{ $value | humanizePercentage }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighLowChangeCpuUsage
expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
for: 0m
labels:
severity: info
annotations:
summary: Container high low change CPU usage (instance {{ $labels.instance }})
description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerLowCpuUtilization
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
for: 7d
labels:
severity: info
annotations:
summary: Container Low CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerLowMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'

View file

@ -1,34 +0,0 @@
groups:
- name: EbpfExporter
rules:
# Fires when ebpf_exporter_ebpf_program_attached has been 0 for 5 minutes; the description reads the program's id label.
# The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
- alert: EbpfExporterProgramNotAttached
expr: 'ebpf_exporter_ebpf_program_attached == 0'
for: 5m
labels:
severity: warning
annotations:
summary: eBPF exporter program not attached (instance {{ $labels.instance }})
description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the 5-minute rate of ebpf_exporter_decoder_errors_total stays above 0.05/s for 5 minutes.
# The description reads the config label to name the affected configuration.
- alert: EbpfExporterDecoderErrors
expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when ebpf_exporter_enabled_configs is 0, or when the metric is missing entirely, for 5 minutes.
# NOTE(review): the absent() branch yields a series without an instance label, so {{ $labels.instance }}
# renders empty in that case. Confirm this is acceptable.
- alert: EbpfExporterNoEnabledConfigs
expr: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
for: 5m
labels:
severity: warning
annotations:
summary: eBPF exporter no enabled configs (instance {{ $labels.instance }})
description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,11 +2,10 @@ groups:
- name: PrometheusCommunityElasticsearchExporter
rules:
- alert: ElasticsearchHeapUsageTooHigh
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90'
for: 2m
labels:
severity: critical
@ -15,7 +14,7 @@ groups:
description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHeapUsageWarning
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80'
for: 2m
labels:
severity: warning
@ -24,7 +23,7 @@ groups:
description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchDiskOutOfSpace
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0'
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10'
for: 0m
labels:
severity: critical
@ -33,7 +32,7 @@ groups:
description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchDiskSpaceLow
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0'
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20'
for: 2m
labels:
severity: warning
@ -59,20 +58,18 @@ groups:
summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: ElasticsearchHealthyNodes
expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: ElasticsearchHealthyDataNodes
expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -117,7 +114,7 @@ groups:
- alert: ElasticsearchUnassignedShards
expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
for: 2m
for: 0m
labels:
severity: critical
annotations:
@ -141,42 +138,3 @@ groups:
annotations:
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance.
- alert: ElasticsearchHighIndexingLatency
expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0'
for: 10m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload.
- alert: ElasticsearchHighIndexingRate
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume.
- alert: ElasticsearchHighQueryRate
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryLatency
expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,177 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
# Fires when envoy_server_live has been anything other than 1 for 1 minute.
- alert: EnvoyServerNotLive
expr: 'envoy_server_live != 1'
for: 1m
labels:
severity: critical
annotations:
summary: Envoy server not live (instance {{ $labels.instance }})
description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when envoy_server_memory_allocated exceeds 90% of envoy_server_memory_heap_size for 5 minutes.
# The trailing guard skips series whose heap size is 0 (avoids dividing by zero).
- alert: EnvoyHighMemoryUsage
expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy high memory usage (instance {{ $labels.instance }})
description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, per instance, 5xx responses exceed 5% of completed downstream requests (5-minute rates) for 1 minute.
# Both sides are summed by instance so the class-filtered numerator lines up with the denominator.
# The trailing guard skips instances with no completed requests.
- alert: EnvoyHighDownstreamHttp5xxErrorRate
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: Envoy high downstream HTTP 5xx error rate (instance {{ $labels.instance }})
description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, per instance, 4xx responses exceed 10% of completed downstream requests (5-minute rates) for 5 minutes.
# Both sides are summed by instance so the class-filtered numerator lines up with the denominator.
# The trailing guard skips instances with no completed requests.
- alert: EnvoyHighDownstreamHttp4xxErrorRate
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy high downstream HTTP 4xx error rate (instance {{ $labels.instance }})
description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_listener_downstream_cx_overflow grew by more than 5 over the last 5 minutes.
- alert: EnvoyDownstreamConnectionsOverflowing
expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when envoy_cluster_membership_healthy has been 0 for 1 minute. The description names the cluster via envoy_cluster_name.
- alert: EnvoyClusterMembershipEmpty
expr: 'envoy_cluster_membership_healthy == 0'
for: 1m
labels:
severity: critical
annotations:
summary: Envoy cluster membership empty (instance {{ $labels.instance }})
description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when healthy members are below 75% of total members in a cluster for 5 minutes.
# The trailing guard skips clusters whose total membership is 0 (avoids dividing by zero).
- alert: EnvoyClusterMembershipDegraded
expr: 'envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when envoy_cluster_upstream_cx_connect_fail grows by more than 10 within a 5-minute window, and that holds for 5 minutes.
- alert: EnvoyHighClusterUpstreamConnectionFailures
expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when upstream request timeouts exceed 5% of completed upstream requests (5-minute rates) for 5 minutes.
# The trailing guard skips series with no completed requests (avoids dividing by zero).
- alert: EnvoyHighClusterUpstreamRequestTimeoutRate
expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when, per instance and cluster, 5xx responses exceed 5% of completed upstream requests (5-minute rates) for 1 minute.
# Both sides are summed by (instance, envoy_cluster_name) before dividing: the 5xx selector carries an
# envoy_response_code_class label, so the raw vectors would not match the completed-requests series one-to-one.
# The trailing guard skips clusters with no completed requests (avoids dividing by zero).
- alert: EnvoyHighClusterUpstream5xxErrorRate
expr: 'sum by (instance, envoy_cluster_name) (rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance, envoy_cluster_name) (rate(envoy_cluster_upstream_rq_completed[5m])) * 100 > 5 and sum by (instance, envoy_cluster_name) (rate(envoy_cluster_upstream_rq_completed[5m])) > 0'
for: 1m
labels:
severity: critical
annotations:
summary: Envoy high cluster upstream 5xx error rate (instance {{ $labels.instance }})
description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when envoy_cluster_health_check_failure grows by more than 5 within a 5-minute window, and that holds for 5 minutes.
- alert: EnvoyClusterHealthCheckFailures
expr: 'increase(envoy_cluster_health_check_failure[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy cluster health check failures (instance {{ $labels.instance }})
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Informational: fires when envoy_cluster_outlier_detection_ejections_active has been above 0 for 5 minutes.
- alert: EnvoyClusterOutlierDetectionEjectionsActive
expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
for: 5m
labels:
severity: info
annotations:
summary: Envoy cluster outlier detection ejections active (instance {{ $labels.instance }})
description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_listener_ssl_connection_error grew by more than 5 over the last 5 minutes.
- alert: EnvoyListenerSslConnectionErrors
expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_listener_downstream_global_cx_overflow grew by more than 5 over the last 5 minutes.
- alert: EnvoyGlobalDownstreamConnectionsOverflowing
expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
for: 0m
labels:
severity: critical
annotations:
summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_server_days_until_first_cert_expiring drops below 7.
# The condition has no lower bound, so it also holds for any value below 0.
- alert: EnvoySslCertificateExpiringSoon
expr: 'envoy_server_days_until_first_cert_expiring < 7'
for: 0m
labels:
severity: warning
annotations:
summary: Envoy SSL certificate expiring soon (instance {{ $labels.instance }})
description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_server_days_until_first_cert_expiring is below 0.
# NOTE(review): confirm that this gauge can actually go negative; if Envoy reports 0 once a
# certificate has expired, this rule never fires and the comparison should be `<= 0`.
- alert: EnvoySslCertificateExpired
expr: 'envoy_server_days_until_first_cert_expiring < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Envoy SSL certificate expired (instance {{ $labels.instance }})
description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when either the default-priority connection breaker (cx_open) or request breaker (rq_open) reports 1.
- alert: EnvoyClusterCircuitBreakerTripped
expr: 'envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1'
for: 0m
labels:
severity: critical
annotations:
summary: Envoy cluster circuit breaker tripped (instance {{ $labels.instance }})
description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires immediately when envoy_cluster_upstream_cx_none_healthy grew by more than 3 over the last 5 minutes.
- alert: EnvoyNoHealthyUpstream
expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Envoy no healthy upstream (instance {{ $labels.instance }})
description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when envoy_http_downstream_rq_timeout grows by more than 5 within a 5-minute window, and that holds for 5 minutes.
# Despite the alert name, the threshold is an absolute count per window, not a ratio of requests.
- alert: EnvoyHighDownstreamRequestTimeoutRate
expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: EtcdInsufficientMembers
@ -30,26 +29,24 @@ groups:
severity: warning
annotations:
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
description: "Etcd leader changed {{ $value }} times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: EtcdHighNumberOfFailedGrpcRequestsWarning
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
- alert: EtcdHighNumberOfFailedGrpcRequests
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
- alert: EtcdHighNumberOfFailedGrpcRequestsCritical
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
- alert: EtcdHighNumberOfFailedGrpcRequests
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdGrpcRequestsSlow
@ -61,27 +58,24 @@ groups:
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
- alert: EtcdHighNumberOfFailedHttpRequestsWarning
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
- alert: EtcdHighNumberOfFailedHttpRequests
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
for: 2m
labels:
severity: warning
annotations:
summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
- alert: EtcdHighNumberOfFailedHttpRequestsCritical
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
- alert: EtcdHighNumberOfFailedHttpRequests
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
for: 2m
labels:
severity: critical
annotations:
summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x.
- alert: EtcdHttpRequestsSlow
expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
for: 2m
@ -92,7 +86,7 @@ groups:
description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdMemberCommunicationSlow
expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15'
expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
for: 2m
labels:
severity: warning
@ -107,10 +101,10 @@ groups:
severity: warning
annotations:
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighFsyncDurations
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5'
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
for: 2m
labels:
severity: warning
@ -119,7 +113,7 @@ groups:
description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighCommitDurations
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25'
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
for: 2m
labels:
severity: warning

View file

@ -1,42 +0,0 @@
groups:
  - name: EmbeddedExporter
    # Every rule below selects gotk_resource_info series whose ready label is "False"
    # and requires the condition to hold for 15 minutes before firing.
    rules:
      # Kustomization objects reporting not ready.
      - alert: FluxKustomizationFailure
        expr: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Flux Kustomization Failure (instance {{ $labels.instance }})
          description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # HelmRelease objects reporting not ready.
      - alert: FluxHelmreleaseFailure
        expr: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Flux HelmRelease Failure (instance {{ $labels.instance }})
          description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Source kinds: GitRepository, HelmRepository, Bucket, OCIRepository.
      - alert: FluxSourceIssue
        expr: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Flux Source Issue (instance {{ $labels.instance }})
          description: "Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Image automation kinds: ImagePolicy, ImageRepository, ImageUpdateAutomation.
      - alert: FluxImageIssue
        expr: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Flux Image Issue (instance {{ $labels.instance }})
          description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,20 +2,19 @@ groups:
- name: ZnerolFreeswitchExporter
rules:
- alert: FreeswitchDown
expr: 'freeswitch_up == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: Freeswitch down (instance {{ $labels.instance }})
description: "Freeswitch {{ $labels.instance }} is unresponsive.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FreeswitchSessionsWarning
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80'
for: 10m
labels:
severity: warning
@ -24,7 +23,7 @@ groups:
description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: FreeswitchSessionsCritical
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0'
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90'
for: 5m
labels:
severity: critical

View file

@ -1,66 +0,0 @@
groups:
  - name: Gitaly
    rules:
      # Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
      # Aggregated per instance so that {{ $labels.instance }} in the annotations is populated;
      # a bare sum() would drop every label. The trailing "and ... > 0" guard skips idle instances.
      - alert: GitlabGitalyHighGrpcErrorRate
        expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly high gRPC error rate (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
      # concurrency limits. This directly impacts users trying to push, pull, or clone.
      # Aggregated per instance so that {{ $labels.instance }} in the annotations is populated.
      - alert: GitlabGitalyResourceExhausted
        expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: GitLab Gitaly resource exhausted (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # p95 of unary RPC handling time over 5 minutes, computed per instance
      # (instance is kept in the grouping so the annotations can name the host).
      - alert: GitlabGitalyHighRpcLatency
        expr: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le, instance)) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise.
      - alert: GitlabGitalyCpuThrottled
        expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly CPU throttled (instance {{ $labels.instance }})
          description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Fires immediately once more than 3 failed authentications were counted within 5 minutes.
      - alert: GitlabGitalyAuthenticationFailures
        expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: GitLab Gitaly authentication failures (instance {{ $labels.instance }})
          description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
      # Check Gitaly service health and logs.
      - alert: GitlabGitalyCircuitBreakerTripped
        expr: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: GitLab Gitaly circuit breaker tripped (instance {{ $labels.instance }})
          description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,216 +0,0 @@
groups:
  - name: GitlabBuiltInExporter
    # Rules that aggregate (sum / histogram_quantile) keep `instance` in the grouping so the
    # {{ $labels.instance }} placeholders in summary and description are populated; an
    # aggregation without it drops the label and the placeholder renders empty.
    rules:
      # Queued connections indicate Puma workers are saturated.
      # Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
      - alert: GitlabPumaHighQueuedConnections
        expr: 'puma_queued_connections > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Puma high queued connections (instance {{ $labels.instance }})
          description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabPumaNoAvailablePoolCapacity
        expr: 'puma_pool_capacity == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: GitLab Puma no available pool capacity (instance {{ $labels.instance }})
          description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # $value is the left-hand side of the comparison, i.e. the running worker count.
      - alert: GitlabPumaWorkersNotRunning
        expr: 'puma_running_workers < puma_workers'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Puma workers not running (instance {{ $labels.instance }})
          description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold is 5% of all requests returning server errors.
      # Check GitLab logs at /var/log/gitlab/ for root cause.
      # The trailing "and ... > 0" guard skips instances that receive no traffic.
      - alert: GitlabHighHttpErrorRate
        expr: 'sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) * 100 > 5 and sum by (instance) (rate(http_requests_total[5m])) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: GitLab high HTTP error rate (instance {{ $labels.instance }})
          description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold of 10s may need adjustment based on your instance size and workload.
      - alert: GitlabHighHttpRequestLatency
        expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab high HTTP request latency (instance {{ $labels.instance }})
          description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
      # A sustained failure rate indicates background processing issues.
      - alert: GitlabSidekiqJobsFailing
        expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab Sidekiq jobs failing (instance {{ $labels.instance }})
          description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # When running jobs approach the concurrency limit, new jobs will queue up.
      # Consider scaling Sidekiq workers or increasing concurrency.
      # Compares running jobs with 90% of the configured concurrency, per instance.
      - alert: GitlabSidekiqQueueTooLarge
        expr: 'sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab Sidekiq queue too large (instance {{ $labels.instance }})
          description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
      # p95 is computed per worker class and per instance; 300 seconds = 5 minutes.
      - alert: GitlabSidekiqHighJobCompletionTime
        expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }})
          description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
      # High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.
      - alert: GitlabSidekiqHighQueueLatency
        expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Sidekiq high queue latency (instance {{ $labels.instance }})
          description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # When the pool is near saturation, requests may block waiting for a connection.
      # Increase db_pool_size in gitlab.rb or investigate slow queries.
      - alert: GitlabDatabaseConnectionPoolSaturation
        expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab database connection pool saturation (instance {{ $labels.instance }})
          description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabDatabaseConnectionPoolDeadConnections
        expr: 'gitlab_database_connection_pool_dead > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab database connection pool dead connections (instance {{ $labels.instance }})
          description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabDatabaseConnectionPoolWaiting
        expr: 'gitlab_database_connection_pool_waiting > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab database connection pool waiting (instance {{ $labels.instance }})
          description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabCiPipelineCreationSlow
        expr: 'histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
          description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # This metric may not exist in all GitLab versions. Verify against your GitLab installation.
      - alert: GitlabCiPipelineFailuresIncreasing
        expr: 'deriv(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab CI pipeline failures increasing (instance {{ $labels.instance }})
          description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Frequent runner auth failures may indicate expired tokens or misconfigured runners.
      - alert: GitlabCiRunnerAuthenticationFailures
        expr: 'increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab CI runner authentication failures (instance {{ $labels.instance }})
          description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold of 2GB may need adjustment based on your instance size.
      # High memory usage can lead to OOM kills and service disruptions.
      - alert: GitlabHighMemoryUsage
        expr: 'process_resident_memory_bytes{job=~".*gitlab.*"} > 2e+9'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab high memory usage (instance {{ $labels.instance }})
          description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Heap fragmentation above 50% means a significant amount of memory is wasted.
      # A Puma worker restart may help reclaim memory.
      - alert: GitlabRubyHeapFragmentation
        expr: 'ruby_gc_stat_ext_heap_fragmentation{job=~".*gitlab.*"} > 0.5'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: GitLab Ruby heap fragmentation (instance {{ $labels.instance }})
          description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabRackUncaughtErrors
        expr: 'rate(rack_uncaught_errors_total[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab rack uncaught errors (instance {{ $labels.instance }})
          description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
      # Fleet-wide by design: the outer count() carries no labels, so the instance
      # placeholder in the summary renders empty for this rule.
      - alert: GitlabVersionMismatch
        expr: 'count(count by (version) (gitlab_build_info)) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: GitLab version mismatch (instance {{ $labels.instance }})
          description: "Multiple GitLab versions are running across the fleet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: GitlabHighFileDescriptorUsage
        expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab high file descriptor usage (instance {{ $labels.instance }})
          description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # NOTE(review): the on(instance) match assumes a single
      # gitlab_ruby_threads_max_expected_threads series per instance - confirm.
      - alert: GitlabRubyThreadsSaturated
        expr: 'sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: GitLab Ruby threads saturated (instance {{ $labels.instance }})
          description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,36 +0,0 @@
groups:
  - name: Workhorse
    rules:
      # Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
      # Threshold from GitLab Omnibus default rules: 10% for high-traffic instances.
      # Aggregated per instance so that {{ $labels.instance }} in the annotations is populated;
      # the trailing "and ... > 0" guard skips instances that receive no traffic.
      - alert: GitlabWorkhorseHighErrorRate
        expr: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: GitLab Workhorse high error rate (instance {{ $labels.instance }})
          description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # p95 request duration over 5 minutes, computed per instance
      # (instance is kept in the grouping so the annotations can name the host).
      - alert: GitlabWorkhorseHighLatency
        expr: 'histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Workhorse high latency (instance {{ $labels.instance }})
          description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold of 100 may need adjustment based on instance size.
      - alert: GitlabWorkhorseHighIn-flightRequests
        expr: 'gitlab_workhorse_http_in_flight_requests > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: GitLab Workhorse high in-flight requests (instance {{ $labels.instance }})
          description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,109 +0,0 @@
groups:
  - name: GolangExporter
    # All rules in this group are warning severity.
    rules:
      # Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
      - alert: GoGoroutineCountHigh
        expr: 'go_goroutines > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go goroutine count high (instance {{ $labels.instance }})
          description: "Go application has too many goroutines (> 1000), potential goroutine leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # quantile="1" is the maximum observed GC pause in the current summary window, not p99.
      # A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated.
      - alert: GoGcDurationHigh
        expr: 'go_gc_duration_seconds{quantile="1"} > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go GC duration high (instance {{ $labels.instance }})
          description: "Go GC pause duration is too high (max > 1s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory.
      # This ratio measures Go-internal memory utilization, not system-level memory pressure.
      - alert: GoMemoryUsageHigh
        expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go memory usage high (instance {{ $labels.instance }})
          description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
      - alert: GoThreadCountHigh
        expr: 'go_threads > 500'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go thread count high (instance {{ $labels.instance }})
          description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Threshold is a rough default. Adjust based on your application's normal object count.
      - alert: GoHeapObjectsCountHigh
        expr: 'go_memstats_heap_objects > 10000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go heap objects count high (instance {{ $labels.instance }})
          description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC.
      # This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+.
      - alert: GoGcCpuFractionHigh
        expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go GC CPU fraction high (instance {{ $labels.instance }})
          description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m).
      # Adjust based on your application's expected concurrency patterns.
      - alert: GoGoroutineSpike
        expr: 'deriv(go_goroutines[5m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go goroutine spike (instance {{ $labels.instance }})
          description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Alerts when heap in-use grows by more than 10MB/s sustained over 10 minutes.
      # Adjust threshold based on your workload.
      # The 10-minute window lives in the deriv() range; there is no additional `for` delay.
      - alert: GoHeapIn-useGrowing
        expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Go heap in-use growing (instance {{ $labels.instance }})
          description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Watches the allocation rate (bytes allocated per second), not retained memory.
      - alert: GoMemoryLeak
        expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go memory leak (instance {{ $labels.instance }})
          description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Stack memory in use above 1e9 bytes for 5 minutes.
      - alert: GoStackMemoryHigh
        expr: 'go_memstats_stack_inuse_bytes > 1e9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Go stack memory high (instance {{ $labels.instance }})
          description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,53 +0,0 @@
groups:
  - name: StackdriverExporter
    # Self-monitoring metrics use the stackdriver_monitoring_* prefix.
    # All self-monitoring metrics include a project_id label.
    rules:
      # Last scrape reported an error, continuously for 5 minutes.
      - alert: StackdriverExporterScrapeError
        expr: 'stackdriver_monitoring_last_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Stackdriver exporter scrape error (instance {{ $labels.instance }})
          description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Last scrape took longer than 300 seconds, continuously for 5 minutes.
      - alert: StackdriverExporterSlowScrape
        expr: 'stackdriver_monitoring_last_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Stackdriver exporter slow scrape (instance {{ $labels.instance }})
          description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # More than 5 scrape errors counted within a 15-minute window; fires immediately.
      - alert: StackdriverExporterScrapeErrorsIncreasing
        expr: 'increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Stackdriver exporter scrape errors increasing (instance {{ $labels.instance }})
          description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # rate() is per second; multiplying by 60 expresses the threshold in calls per minute.
      - alert: StackdriverExporterHighApiCalls
        expr: 'rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Stackdriver exporter high API calls (instance {{ $labels.instance }})
          description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Age of the last scrape timestamp exceeds 600 seconds; fires immediately.
      - alert: StackdriverExporterScrapeStale
        expr: 'time() - stackdriver_monitoring_last_scrape_timestamp > 600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Stackdriver exporter scrape stale (instance {{ $labels.instance }})
          description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,15 +0,0 @@
groups:
  - name: EmbeddedExporter
    rules:
      # Selects every instance that exposed alloy_build_info two hours ago but exposes
      # no such series now (`unless` removes instances that are still present).
      - alert: GrafanaAlloyServiceDown
        expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Grafana Alloy service down (instance {{ $labels.instance }})
          description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,465 +0,0 @@
groups:
- name: EmbeddedExporter
# Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected.
rules:
- alert: MimirIngesterUnhealthy
expr: 'min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester unhealthy (instance {{ $labels.instance }})
description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRequestErrors
expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir request errors (instance {{ $labels.instance }})
description: "Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirInconsistentRuntimeConfig
expr: 'count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1'
for: 1h
labels:
severity: critical
annotations:
summary: Mimir inconsistent runtime config (instance {{ $labels.instance }})
description: "An inconsistent runtime config file is used across Mimir instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The inner filter keeps only series whose value is 0, so they must be counted, not summed:
# a sum of zeros is 0 and would never satisfy `> 0`. $value is the number of affected series per job.
- alert: MimirBadRuntimeConfig
  expr: 'count by (job) (cortex_runtime_config_last_reload_successful == 0) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Mimir bad runtime config (instance {{ $labels.instance }})
    description: "{{ $labels.job }} failed to reload runtime config.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirSchedulerQueriesStuck
expr: 'sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0'
for: 7m
labels:
severity: critical
annotations:
summary: Mimir scheduler queries stuck (instance {{ $labels.instance }})
description: "There are {{ $value }} queued up queries in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCacheRequestErrors
expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Mimir cache request errors (instance {{ $labels.instance }})
description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirKvStoreFailure
expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir KV store failure (instance {{ $labels.instance }})
description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirMemoryMapAreasTooHigh
expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir memory map areas too high (instance {{ $labels.instance }})
description: "Mimir {{ $labels.job }} is using {{ printf \"%.0f\" $value }}% of its memory map area limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Transition detector: the instance reports 0 tenants now but reported some 2h ago.
# The offset must be longer than `for`: with `offset 1h` and `for: 1h` the look-back sample
# is already 0 when the pending period ends, so the alert resolved itself before ever firing.
# The alert fires after 1h at zero and clears once the zero value is itself 2h old.
- alert: MimirIngesterInstanceHasNoTenants
  expr: '(cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 2h > 0)'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Mimir ingester instance has no tenants (instance {{ $labels.instance }})
    description: "Mimir ingester {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Transition detector: the ruler manages 0 rule groups now but managed some 2h ago.
# The offset must be longer than `for`: with `offset 1h` and `for: 1h` the look-back sample
# is already 0 when the pending period ends, so the alert resolved itself before ever firing.
# The alert fires after 1h at zero and clears once the zero value is itself 2h old.
- alert: MimirRulerInstanceHasNoRuleGroups
  expr: '(cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 2h > 0)'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Mimir ruler instance has no rule groups (instance {{ $labels.instance }})
    description: "Mimir ruler {{ $labels.instance }} has no rule groups assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# `-` and `>` bind tighter than `and`, so the aggregated expression reads as
# (head_max_timestamp - time()) and (head_max_timestamp > 0): the number of seconds the newest
# head sample lies ahead of now, restricted to series whose head timestamp is non-zero.
# The alert triggers when the per-job maximum exceeds 3600 seconds (1 hour).
- alert: MimirIngestedDataTooFarInTheFuture
expr: 'max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600'
for: 5m
labels:
severity: warning
annotations:
summary: Mimir ingested data too far in the future (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
# The component selector restricts the alert to store-gateway bucket operations; without it,
# failures from any other component exposing this metric would be reported as store-gateway failures.
- alert: MimirStoreGatewayTooManyFailedOperations
  expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{component="store-gateway"}[5m])) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }})
    description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRingMembersMismatch
expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
for: 15m
labels:
severity: warning
annotations:
summary: Mimir ring members mismatch (instance {{ $labels.instance }})
description: "Mimir {{ $labels.name }} ring has inconsistent member counts across instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The division uses ignoring(limit), so its result no longer carries the `limit` label.
# The guard on the limit metric must therefore also match with ignoring(limit); a plain `and`
# requires identical label sets, finds none, and would make this alert permanently silent.
- alert: MimirIngesterReachingSeriesLimitWarning
  expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
  for: 3h
  labels:
    severity: warning
  annotations:
    summary: Mimir ingester reaching series limit warning (instance {{ $labels.instance }})
    description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The division uses ignoring(limit), so its result no longer carries the `limit` label.
# The guard on the limit metric must therefore also match with ignoring(limit); a plain `and`
# requires identical label sets, finds none, and would make this alert permanently silent.
- alert: MimirIngesterReachingSeriesLimitCritical
  expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_series"} > 0)'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Mimir ingester reaching series limit critical (instance {{ $labels.instance }})
    description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The division uses ignoring(limit), so its result no longer carries the `limit` label.
# The guard on the limit metric must therefore also match with ignoring(limit); a plain `and`
# requires identical label sets, finds none, and would make this alert permanently silent.
- alert: MimirIngesterReachingTenantsLimitWarning
  expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: Mimir ingester reaching tenants limit warning (instance {{ $labels.instance }})
    description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The division uses ignoring(limit), so its result no longer carries the `limit` label.
# The guard on the limit metric must therefore also match with ignoring(limit); a plain `and`
# requires identical label sets, finds none, and would make this alert permanently silent.
- alert: MimirIngesterReachingTenantsLimitCritical
  expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and ignoring(limit) (cortex_ingester_instance_limits{limit="max_tenants"} > 0)'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Mimir ingester reaching tenants limit critical (instance {{ $labels.instance }})
    description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirReachingTcpConnectionsLimit
expr: 'cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir reaching TCP connections limit (instance {{ $labels.instance }})
description: "Mimir instance {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its TCP connections limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The division uses ignoring(limit), so its result no longer carries the `limit` label.
# The guard on the limit metric must therefore also match with ignoring(limit); a plain `and`
# requires identical label sets, finds none, and would make this alert permanently silent.
- alert: MimirDistributorInflightRequestsHigh
  expr: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and ignoring(limit) (cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0)'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
    description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbHeadCompactionFailed
expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbHeadTruncationFailed
expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbCheckpointCreationFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbCheckpointDeletionFailed
expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05'
for: 0m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbWalTruncationFailed
expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05'
for: 0m
labels:
severity: warning
annotations:
summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirIngesterTsdbWalWritesFailed
expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05'
for: 3m
labels:
severity: critical
annotations:
summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 30 minutes. Adjust based on your sync interval.
- alert: MimirStoreGatewayHasNotSyncedBucket
expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Transition detector: the store-gateway has 0 synced tenants now but had some 2h ago.
# The offset must be longer than `for`: with `offset 1h` and `for: 1h` the look-back sample
# is already 0 when the pending period ends, so the alert resolved itself before ever firing.
# The alert fires after 1h at zero and clears once the zero value is itself 2h old.
- alert: MimirStoreGatewayNoSyncedTenants
  expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 2h > 0)'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Mimir store gateway no synced tenants (instance {{ $labels.instance }})
    description: "Mimir store-gateway {{ $labels.instance }} has no synced tenants.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirBucketIndexNotUpdated
expr: 'min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100'
for: 0m
labels:
severity: critical
annotations:
summary: Mimir bucket index not updated (instance {{ $labels.instance }})
description: "Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCompactorNotCleaningUpBlocks
expr: '(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0'
for: 1h
labels:
severity: critical
annotations:
summary: Mimir compactor not cleaning up blocks (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCompactorNotRunningCompaction
expr: '(time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir compactor not running compaction (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCompactorHasConsecutiveFailures
expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase().
- alert: MimirCompactorHasRunOutOfDiskSpace
expr: 'delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
for: 0m
labels:
severity: critical
annotations:
summary: Mimir compactor has run out of disk space (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has run out of disk space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirCompactorHasNotUploadedBlocks
expr: '(time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Using a 24h window as compaction skips are rare events.
- alert: MimirCompactorSkippedBlocks
expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRulerTooManyFailedPushes
expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir ruler too many failed pushes (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRulerTooManyFailedQueries
expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir ruler too many failed queries (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirRulerMissedEvaluations
expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirRulerFailedRingCheck
expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05'
for: 5m
labels:
severity: critical
annotations:
summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerSyncConfigsFailing
expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05'
for: 30m
labels:
severity: critical
annotations:
summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerRingCheckFailing
expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
annotations:
summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerStateMergeFailing
expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
annotations:
summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerReplicationFailing
expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05'
for: 10m
labels:
severity: critical
annotations:
summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: MimirAlertmanagerPersistStateFailing
expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05'
for: 1h
labels:
severity: critical
annotations:
summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirAlertmanagerInitialSyncFailed
expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0'
for: 0m
labels:
severity: warning
annotations:
summary: Mimir alertmanager initial sync failed (instance {{ $labels.instance }})
description: "Mimir alertmanager {{ $labels.job }} failed initial state sync.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Transition detector: the alertmanager owns 0 tenants now but owned some 2h ago.
# The offset must be longer than `for`: with `offset 1h` and `for: 1h` the look-back sample
# is already 0 when the pending period ends, so the alert resolved itself before ever firing.
# The alert fires after 1h at zero and clears once the zero value is itself 2h old.
- alert: MimirAlertmanagerInstanceHasNoTenants
  expr: '(cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 2h > 0)'
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: Mimir alertmanager instance has no tenants (instance {{ $labels.instance }})
    description: "Mimir alertmanager {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Per job, compares the largest member count reported by any instance with the average member
# count inflated by 15% plus 10. The alert triggers when the maximum exceeds that inflated average.
# $value is the left-hand side of the comparison (the inflated average), not the maximum.
- alert: MimirGossipMembersCountTooHigh
expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
for: 20m
labels:
severity: warning
annotations:
summary: Mimir gossip members count too high (instance {{ $labels.instance }})
description: "Mimir gossip cluster has more members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirGossipMembersCountTooLow
expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
for: 20m
labels:
severity: warning
annotations:
summary: Mimir gossip members count too low (instance {{ $labels.instance }})
description: "Mimir gossip cluster has fewer members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# go_threads counts OS threads created by the Go runtime, not goroutines. A high value usually means many threads are blocked in system calls or cgo calls.
- alert: MimirGoThreadsTooHighWarning
expr: 'go_threads{job=~".*(mimir|cortex).*"} > 5000'
for: 15m
labels:
severity: warning
annotations:
summary: Mimir go threads too high warning (instance {{ $labels.instance }})
description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MimirGoThreadsTooHighCritical
expr: 'go_threads{job=~".*(mimir|cortex).*"} > 8000'
for: 15m
labels:
severity: critical
annotations:
summary: Mimir go threads too high critical (instance {{ $labels.instance }})
description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,175 +0,0 @@
groups:
- name: EmbeddedExporter
rules:
- alert: TempoDistributorUnhealthy
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0'
for: 15m
labels:
severity: warning
annotations:
summary: Tempo distributor unhealthy (instance {{ $labels.instance }})
description: "Tempo has {{ $value }} unhealthy distributor(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoLiveStoreUnhealthy
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Tempo live store unhealthy (instance {{ $labels.instance }})
description: "Tempo has {{ $value }} unhealthy live store(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorUnhealthy
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Tempo metrics generator unhealthy (instance {{ $labels.instance }})
description: "Tempo has {{ $value }} unhealthy metrics generator(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing.
- alert: TempoCompactionsFailing
expr: 'sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0'
for: 1h
labels:
severity: critical
annotations:
summary: Tempo compactions failing (instance {{ $labels.instance }})
description: "{{ $value }} compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoPollsFailing
expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Tempo polls failing (instance {{ $labels.instance }})
description: "{{ $value }} blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoTenantIndexFailures
expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Tempo tenant index failures (instance {{ $labels.instance }})
description: "{{ $value }} tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Left side: tenants whose summed tenant_index_builder value is 0.
# `and on()` matches on no labels at all, so those tenants are kept only while the
# global maximum of tempodb_blocklist_length is above 0.
- alert: TempoNoTenantIndexBuilders
expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Tempo no tenant index builders (instance {{ $labels.instance }})
description: "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 600s (10 minutes). Adjust based on your tenant index build interval.
- alert: TempoTenantIndexTooOld
expr: 'max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600'
for: 5m
labels:
severity: critical
annotations:
summary: Tempo tenant index too old (instance {{ $labels.instance }})
description: "Tenant index for {{ $labels.tenant }} is {{ $value }}s old.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the blocklist grows more than 40% over 7 days.
- alert: TempoBlockListRisingQuickly
expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Tempo block list rising quickly (instance {{ $labels.instance }})
description: "Tempo blocklist length is up {{ printf \"%.0f\" $value }}% over the last 7 days. Consider scaling compactors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The inner filter keeps only series whose value is 0, so they must be counted, not summed:
# a sum of zeros is 0 and would never satisfy `> 0`. $value is the number of affected series per job.
- alert: TempoBadOverrides
  expr: 'count by (job) (tempo_runtime_config_last_reload_successful == 0) > 0'
  for: 15m
  labels:
    severity: critical
  annotations:
    summary: Tempo bad overrides (instance {{ $labels.instance }})
    description: "{{ $labels.job }} failed to reload runtime overrides.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoUserConfigurableOverridesReloadFailing
expr: 'sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }})
description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 blocks per compactor instance. Adjust based on your environment.
- alert: TempoCompactionTooManyOutstandingBlocksWarning
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 100'
for: 6h
labels:
severity: warning
annotations:
summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
- alert: TempoCompactionTooManyOutstandingBlocksCritical
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
for: 24h
labels:
severity: critical
annotations:
summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 0.05/s avoids firing on transient single-event spikes.
- alert: TempoDistributorUsageTrackerErrors
expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05'
for: 30m
labels:
severity: critical
annotations:
summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }})
description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorProcessorUpdatesFailing
expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2'
for: 15m
labels:
severity: critical
annotations:
summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }})
description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
for: 15m
labels:
severity: warning
annotations:
summary: Tempo metrics generator service graphs dropping spans (instance {{ $labels.instance }})
description: "Tempo metrics generator is dropping {{ printf \"%.2f\" $value }}% of spans in service graphs for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: TempoMetricsGeneratorCollectionsFailing
expr: 'sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2'
for: 5m
labels:
severity: critical
annotations:
summary: Tempo metrics generator collections failing (instance {{ $labels.instance }})
description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.
- alert: TempoMemcachedErrorsElevated
expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0'
for: 10m
labels:
severity: warning
annotations:
summary: Tempo memcached errors elevated (instance {{ $labels.instance }})
description: "Tempo memcached error rate is {{ printf \"%.2f\" $value }}% for {{ $labels.name }} in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: ProviderFailedBecauseNet_versionFailed
@ -41,22 +40,20 @@ groups:
summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 10ms. Adjust based on your expected database latency.
- alert: StoreConnectionSlow
- alert: StoreConnectionIsTooSlow
expr: 'store_connection_wait_time_ms > 10'
for: 0m
labels:
severity: warning
annotations:
summary: Store connection slow (instance {{ $labels.instance }})
summary: Store connection is too slow (instance {{ $labels.instance }})
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 20ms. Adjust based on your expected database latency.
- alert: StoreConnectionVerySlow
- alert: StoreConnectionIsTooSlow
expr: 'store_connection_wait_time_ms > 20'
for: 0m
labels:
severity: critical
annotations:
summary: Store connection very slow (instance {{ $labels.instance }})
description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Store connection is too slow (instance {{ $labels.instance }})
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,12 +2,8 @@ groups:
- name: Jmx_exporter
rules:
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
# so this alert may not fire. Prefer application-level availability metrics if available.
# Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
- alert: HadoopNameNodeDown
expr: 'up{job="hadoop-namenode"} == 0'
for: 5m
@ -17,9 +13,6 @@ groups:
summary: Hadoop Name Node Down (instance {{ $labels.instance }})
description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
# so this alert may not fire. Prefer application-level availability metrics if available.
# Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
- alert: HadoopResourceManagerDown
expr: 'up{job="hadoop-resourcemanager"} == 0'
for: 5m
@ -39,7 +32,7 @@ groups:
description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopHdfsDiskSpaceLow
expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0'
expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
for: 15m
labels:
severity: warning
@ -48,7 +41,7 @@ groups:
description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopMapReduceTaskFailures
expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
expr: 'hadoop_mapreduce_task_failures_total > 100'
for: 10m
labels:
severity: critical
@ -57,7 +50,7 @@ groups:
description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopResourceManagerMemoryHigh
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0'
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
for: 15m
labels:
severity: warning
@ -66,7 +59,7 @@ groups:
description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopYarnContainerAllocationFailures
expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
expr: 'hadoop_yarn_container_allocation_failures_total > 10'
for: 10m
labels:
severity: warning
@ -84,10 +77,10 @@ groups:
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HadoopHbaseRegionServerHeapLow
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0'
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
for: 10m
labels:
severity: warning
severity: critical
annotations:
summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,29 +2,28 @@ groups:
- name: EmbeddedExporterV2
rules:
- alert: HaproxyHighHttp4xxErrorRateBackend
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
@ -33,7 +32,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateServer
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
@ -42,7 +41,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerResponseErrors
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
for: 1m
labels:
severity: critical
@ -57,7 +56,7 @@ groups:
severity: critical
annotations:
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
@ -66,20 +65,19 @@ groups:
severity: critical
annotations:
summary: HAProxy server connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession>80%
expr: '(haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0'
expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
description: "Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
- alert: HaproxyPendingRequests
expr: 'sum by (proxy) (haproxy_backend_current_queue) > 0'
expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0'
for: 2m
labels:
severity: warning
@ -94,7 +92,7 @@ groups:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
description: "HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyRetryHigh
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
@ -124,10 +122,10 @@ groups:
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
for: 0m
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: HaproxyExporterV1
rules:
- alert: HaproxyDown
@ -14,104 +13,104 @@ groups:
summary: HAProxy down (instance {{ $labels.instance }})
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateBackend(v1)
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
- alert: HaproxyHighHttp4xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateBackend(v1)
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
- alert: HaproxyHighHttp5xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp4xxErrorRateServer(v1)
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
- alert: HaproxyHighHttp4xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }})
summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHighHttp5xxErrorRateServer(v1)
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
- alert: HaproxyHighHttp5xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }})
summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerResponseErrors(v1)
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
- alert: HaproxyServerResponseErrors
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy server response errors (v1) (instance {{ $labels.instance }})
summary: HAProxy server response errors (instance {{ $labels.instance }})
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendConnectionErrors(v1)
- alert: HaproxyBackendConnectionErrors
expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
for: 1m
labels:
severity: critical
annotations:
summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerConnectionErrors(v1)
- alert: HaproxyServerConnectionErrors
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }})
summary: HAProxy server connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0'
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy backend max active session (instance {{ $labels.instance }})
description: "HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyPendingRequests(v1)
- alert: HaproxyPendingRequests
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy pending requests (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyHttpSlowingDown(v1)
- alert: HaproxyHttpSlowingDown
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }})
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyRetryHigh(v1)
- alert: HaproxyRetryHigh
expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy retry high (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendDown
expr: 'haproxy_backend_up == 0'
@ -131,20 +130,20 @@ groups:
summary: HAProxy server down (instance {{ $labels.instance }})
description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyFrontendSecurityBlockedRequests(v1)
- alert: HaproxyFrontendSecurityBlockedRequests
expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }})
summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyServerHealthcheckFailure(v1)
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
for: 0m
- alert: HaproxyServerHealthcheckFailure
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,12 +2,11 @@ groups:
- name: EmbeddedExporter
rules:
- alert: VaultSealed
expr: 'vault_core_unsealed == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -21,7 +20,7 @@ groups:
severity: warning
annotations:
summary: Vault too many pending tokens (instance {{ $labels.instance }})
description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: VaultTooManyInfinityTokens
expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
@ -30,13 +29,13 @@ groups:
severity: warning
annotations:
summary: Vault too many infinity tokens (instance {{ $labels.instance }})
description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: VaultClusterHealth
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0'
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
for: 0m
labels:
severity: critical
annotations:
summary: Vault cluster health (instance {{ $labels.instance }})
description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,11 +2,10 @@ groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -14,106 +13,107 @@ groups:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
- alert: HostMemoryUnderMemoryPressure
expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)'
for: 0m
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
for: 0m
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
for: 0m
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskIoUtilizationHigh
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
for: 0m
- alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host disk IO utilization high (instance {{ $labels.instance }})
description: "Disk utilization is high (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: critical
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
- alert: HostDiskWillFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0'
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: critical
severity: warning
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
expr: 'node_filesystem_device_error == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesMayFillIn24Hours
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
- alert: HostInodesWillFillIn24Hours
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -122,7 +122,7 @@ groups:
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -131,7 +131,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
@ -139,18 +139,17 @@ groups:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@ -159,37 +158,34 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# x2 context switches is an arbitrary number.
# The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
- alert: HostContextSwitching
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host context switching (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0'
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -198,16 +194,16 @@ groups:
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1)'
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host systemd service crashed (instance {{ $labels.instance }})
description: "systemd service {{ $labels.name }} crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@ -216,7 +212,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
@ -224,37 +220,35 @@ groups:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Uses ignoring(state) to handle additional labels on node_md_disks.
- alert: HostSoftwareRaidInsufficientDrives
expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
- alert: HostRaidArrayGotInactive
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host RAID array got inactive (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0)'
- alert: HostRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: 'changes(node_uname_info[1h]) > 0'
for: 0m
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 6h
labels:
severity: info
severity: warning
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15-20 minutes to recover, the alert should still trigger.
- alert: HostOomKillDetected
expr: '(delta(node_vmstat_oom_kill[30m]) > 0)'
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@ -263,25 +257,25 @@ groups:
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when the EDAC correctable-error counter grew over the 1m window used by increase().
- alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires as soon as the EDAC uncorrectable-error counter is non-zero; the expression applies no time window.
- alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0)'
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -290,7 +284,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0'
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -298,8 +292,17 @@ groups:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@ -308,7 +311,7 @@ groups:
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0'
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@ -317,7 +320,7 @@ groups:
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 10m
labels:
severity: warning
@ -326,10 +329,19 @@ groups:
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host clock not synchronising (instance {{ $labels.instance }})
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
annotations:
summary: Host requires reboot (instance {{ $labels.instance }})
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,165 +0,0 @@
groups:
  # Alert rules built on ipmi_* metrics: collector health, per-sensor state
  # (temperature, fan, voltage, current, power, generic), chassis state and SEL free space.
  - name: IpmiExporter
    rules:
      # The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
      - alert: IpmiCollectorDown
        expr: 'ipmi_up == 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI collector down (instance {{ $labels.instance }})
          description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
      - alert: IpmiTemperatureSensorWarning
        expr: 'ipmi_temperature_state == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI temperature sensor warning (instance {{ $labels.instance }})
          description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiTemperatureSensorCritical
        expr: 'ipmi_temperature_state == 2'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI temperature sensor critical (instance {{ $labels.instance }})
          description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiFanSpeedSensorWarning
        expr: 'ipmi_fan_speed_state == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI fan speed sensor warning (instance {{ $labels.instance }})
          description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiFanSpeedSensorCritical
        expr: 'ipmi_fan_speed_state == 2'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI fan speed sensor critical (instance {{ $labels.instance }})
          description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiFanSpeedZero
        expr: 'ipmi_fan_speed_rpm == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: IPMI fan speed zero (instance {{ $labels.instance }})
          description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiVoltageSensorWarning
        expr: 'ipmi_voltage_state == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI voltage sensor warning (instance {{ $labels.instance }})
          description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiVoltageSensorCritical
        expr: 'ipmi_voltage_state == 2'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI voltage sensor critical (instance {{ $labels.instance }})
          description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiCurrentSensorWarning
        expr: 'ipmi_current_state == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI current sensor warning (instance {{ $labels.instance }})
          description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiCurrentSensorCritical
        expr: 'ipmi_current_state == 2'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI current sensor critical (instance {{ $labels.instance }})
          description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiPowerSensorWarning
        expr: 'ipmi_power_state == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI power sensor warning (instance {{ $labels.instance }})
          description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiPowerSensorCritical
        expr: 'ipmi_power_state == 2'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI power sensor critical (instance {{ $labels.instance }})
          description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
      - alert: IpmiGenericSensorCritical
        expr: 'ipmi_sensor_state == 2'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: IPMI generic sensor critical (instance {{ $labels.instance }})
          description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: IpmiChassisPowerOff
        expr: 'ipmi_chassis_power_state == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI chassis power off (instance {{ $labels.instance }})
          description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # The metric uses inverted logic: 1=no fault, 0=fault detected.
      - alert: IpmiChassisDriveFault
        expr: 'ipmi_chassis_drive_fault_state == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI chassis drive fault (instance {{ $labels.instance }})
          description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # The metric uses inverted logic: 1=no fault, 0=fault detected.
      - alert: IpmiChassisCoolingFault
        expr: 'ipmi_chassis_cooling_fault_state == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: IPMI chassis cooling fault (instance {{ $labels.instance }})
          description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.
      - alert: IpmiSelAlmostFull
        expr: 'ipmi_sel_free_space_bytes < 512'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: IPMI SEL almost full (instance {{ $labels.instance }})
          description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: IstioKubernetesGatewayAvailabilityDrop
@ -12,18 +11,17 @@ groups:
severity: warning
annotations:
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
description: "Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotHighPushErrorRate
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0'
- alert: IstioPilotHighTotalRequestRate
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
for: 1m
labels:
severity: warning
annotations:
summary: Istio Pilot high push error rate (instance {{ $labels.instance }})
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Mixer was deprecated in Istio 1.5 and removed in Istio 1.8+. This alert only applies to Istio < 1.8.
- alert: IstioMixerPrometheusDispatchesLow
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
for: 1m
@ -33,7 +31,6 @@ groups:
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic.
- alert: IstioHighTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
for: 2m
@ -41,9 +38,8 @@ groups:
severity: warning
annotations:
summary: Istio high total request rate (instance {{ $labels.instance }})
description: "Global request rate in the service mesh is unusually high ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or low-traffic environments.
- alert: IstioLowTotalRequestRate
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
for: 2m
@ -51,49 +47,49 @@ groups:
severity: warning
annotations:
summary: Istio low total request rate (instance {{ $labels.instance }})
description: "Global request rate in the service mesh is unusually low ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when more than 5% of destination-reported requests over the 5m window carry a 4xx response code.
- alert: IstioHigh4xxErrorRate
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
for: 1m
labels:
severity: warning
annotations:
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
description: "High percentage of HTTP 4xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHigh5xxErrorRate
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
for: 1m
labels:
severity: warning
annotations:
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
description: "High percentage of HTTP 5xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioHighRequestLatency
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
for: 1m
labels:
severity: warning
annotations:
summary: Istio high request latency (instance {{ $labels.instance }})
description: "Istio average request duration is {{ $value }}ms (> 100ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioLatency99Percentile
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000'
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
for: 1m
labels:
severity: warning
annotations:
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotDuplicateEntry
expr: 'sum(pilot_duplicate_envoy_clusters{}) > 0'
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
description: "Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,82 +0,0 @@
groups:
  - name: EmbeddedExporterLegacy
    # These rules target Jaeger v1.x metrics (jaeger_* prefix).
    # Jaeger v1 reached end-of-life on December 31, 2025.
    # For Jaeger v2+, use the "Embedded exporter (v2+)" rules instead.
    # Note: jaeger-agent was deprecated in v1.35 and removed in v2.0.
    #
    # Every rule below computes an error/drop percentage per (instance, job, namespace)
    # and fires when it stays above 1% for 15m. The trailing `and ... > 0` clause
    # restricts the alert to series whose denominator rate is non-zero.
    rules:
      - alert: JaegerAgentHttpServerErrors
        expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger agent HTTP server errors (instance {{ $labels.instance }})
          description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerClientRpcRequestErrors
        expr: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger client RPC request errors (instance {{ $labels.instance }})
          description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerClientSpansDropped
        expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger client spans dropped (instance {{ $labels.instance }})
          description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerAgentSpansDropped
        expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger agent spans dropped (instance {{ $labels.instance }})
          description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerCollectorDroppingSpans
        expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger collector dropping spans (instance {{ $labels.instance }})
          description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerSamplingUpdateFailing
        expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger sampling update failing (instance {{ $labels.instance }})
          description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerThrottlingUpdateFailing
        expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger throttling update failing (instance {{ $labels.instance }})
          description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: JaegerQueryRequestFailures
        expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Jaeger query request failures (instance {{ $labels.instance }})
          description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,94 +0,0 @@
groups:
- name: EmbeddedExporter
# Jaeger v2 is built on OpenTelemetry Collector and exposes metrics on port 8888 (/metrics).
# It emits standard otelcol_* pipeline metrics alongside Jaeger-specific storage and query metrics.
# For span ingestion pipeline alerts (refused spans, export failures, queue saturation),
# use the OpenTelemetry Collector rules instead.
rules:
# Percentage of storage requests with result="err", computed per instance/job/namespace/operation
# from 1m rates. The trailing "> 0" clause keeps the rule silent for operations with no traffic.
- alert: JaegerHighStorageErrorRate
expr: '100 * sum by (instance, job, namespace, operation) (rate(jaeger_storage_requests_total{result="err"}[1m])) / sum by (instance, job, namespace, operation) (rate(jaeger_storage_requests_total[1m])) > 1 and sum by (instance, job, namespace, operation) (rate(jaeger_storage_requests_total[1m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger high storage error rate (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} is experiencing {{ $value | humanize }}% storage errors on {{ $labels.operation }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The 1s cutoff is only a starting point; tune it to your storage backend and data volume.
# p99 is derived from the jaeger_storage_latency_seconds histogram over a 5m rate window, per operation.
- alert: JaegerSlowStorageOperations
expr: 'histogram_quantile(0.99, sum by (le, instance, job, namespace, operation) (rate(jaeger_storage_latency_seconds_bucket[5m]))) > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger slow storage operations (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} storage p99 latency for {{ $labels.operation }} is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Scoped to http_route="/api/traces" (the trace search endpoint). The http_server_request_duration_seconds
# metric comes from the otelhttp middleware used by the Jaeger query service.
# Alerts when more than 1% of requests on that route return a 5xx status, and only while the route has traffic.
- alert: JaegerQueryServiceHighErrorRate
expr: '100 * sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/traces",http_response_status_code=~"5.."}[1m])) / sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) > 1 and sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger query service high error rate (instance {{ $labels.instance }})
description: "Jaeger query service on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The 2s cutoff is only a starting point; tune it to your storage backend and data volume.
# p99 latency of the /api/traces route, taken from the request-duration histogram over a 5m rate window.
- alert: JaegerQueryServiceSlowResponses
expr: 'histogram_quantile(0.99, sum by (le, instance, job, namespace) (rate(http_server_request_duration_seconds_bucket{http_route="/api/traces"}[5m]))) > 2'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger query service slow responses (instance {{ $labels.instance }})
description: "Jaeger query service on {{ $labels.instance }} p99 response latency is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when a storage operation type is producing errors and has no successful requests at all.
# Indicates the storage backend (Cassandra, Elasticsearch, etc.) is likely unreachable or misconfigured.
# "unless <ok rate> > 0" is used rather than "and <ok rate> == 0": the "and" operator needs a matching
# right-hand sample, so the alert could never fire for an operation that has no result="ok" series
# (e.g. if that counter is only exported after its first increment — TODO confirm for your Jaeger version).
# The alert value is the per-operation error rate (requests/second).
- alert: JaegerStorageCompletelyUnavailable
expr: 'sum(rate(jaeger_storage_requests_total{result="err"}[1m])) by (instance, job, namespace, operation) > 0 unless sum(rate(jaeger_storage_requests_total{result="ok"}[1m])) by (instance, job, namespace, operation) > 0'
for: 2m
labels:
severity: critical
annotations:
summary: Jaeger storage completely unavailable (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} has 100% storage errors for {{ $labels.operation }} — storage backend may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fetching one trace (/api/traces/{traceID}) can take longer than a search, especially for large traces.
# The 5s cutoff is a rough default; p99 is computed over a 5m rate window.
- alert: JaegerSlowSingleTraceRetrieval
expr: 'histogram_quantile(0.99, sum by (le, instance, job, namespace) (rate(http_server_request_duration_seconds_bucket{http_route="/api/traces/{traceID}"}[5m]))) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger slow single trace retrieval (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} p99 latency for single trace retrieval is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 5xx responses on /api/services mean the storage backend cannot return the list of instrumented
# services, which breaks the service selector in the Jaeger UI.
# Alerts above 1% errors on that route, and only while the route has traffic.
- alert: JaegerServiceDiscoveryErrors
expr: '100 * sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/services",http_response_status_code=~"5.."}[1m])) / sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) > 1 and sum by (instance, job, namespace) (rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger service discovery errors (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors on the services endpoint.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Fires when an operation (e.g. find_traces, get_services) has received requests in the last 15 minutes
# but none of them succeeded. May indicate a persistent storage error or a backend that is slow to recover.
# The request count is the left-hand operand so that the alert value is the number of requests seen in the
# window, and "unless <ok increase> > 0" is used so the rule still fires when no result="ok" series exists
# for the operation (an "and ... == 0" clause yields nothing when that series is missing).
- alert: JaegerNoStorageReadsSucceeding
expr: 'sum(increase(jaeger_storage_requests_total[15m])) by (instance, job, namespace, operation) > 0 unless sum(increase(jaeger_storage_requests_total{result="ok"}[15m])) by (instance, job, namespace, operation) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Jaeger no storage reads succeeding (instance {{ $labels.instance }})
description: "Jaeger on {{ $labels.instance }} has no successful storage reads for {{ $labels.operation }} in the past 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,26 +2,16 @@ groups:
- name: MetricPlugin
rules:
- alert: JenkinsNodeOffline
expr: 'jenkins_node_offline_value > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Jenkins node offline (instance {{ $labels.instance }})
description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsNoNodeOnline
expr: 'jenkins_node_online_value == 0'
- alert: JenkinsOffline
expr: 'jenkins_node_offline_value > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Jenkins no node online (instance {{ $labels.instance }})
description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Jenkins offline (instance {{ $labels.instance }})
description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsHealthcheck
expr: 'jenkins_health_check_score < 1'
@ -51,7 +41,7 @@ groups:
description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JenkinsRunFailureTotal
expr: 'increase(jenkins_runs_failure_total[1h]) > 100'
expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
for: 0m
labels:
severity: warning
@ -68,12 +58,6 @@ groups:
summary: Jenkins build tests failing (instance {{ $labels.instance }})
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# * RUNNING -1 true - The build is still in progress.
# * SUCCESS 0 true - The build had no errors.
# * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
# * FAILURE 2 false - The build had a fatal error.
# * NOT_BUILT 3 false - The module was not built.
# * ABORTED 4 false - The build was manually aborted.
- alert: JenkinsLastBuildFailed
expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
for: 0m

View file

@ -2,7 +2,6 @@ groups:
- name: CzerwonkJunosExporter
rules:
- alert: JuniperSwitchDown
@ -14,20 +13,20 @@ groups:
summary: Juniper switch down (instance {{ $labels.instance }})
description: "The switch appears to be down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JuniperCriticalBandwidthUsage1gib
- alert: JuniperHighBandwidthUsage1gib
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
for: 1m
labels:
severity: critical
annotations:
summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }})
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
description: "Interface is highly saturated. (> 0.90GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JuniperWarningBandwidthUsage1gib
- alert: JuniperHighBandwidthUsage1gib
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
for: 1m
labels:
severity: warning
annotations:
summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }})
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
description: "Interface is getting saturated. (> 0.80GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,119 +2,13 @@ groups:
- name: JvmExporter
rules:
- alert: JvmMemoryFillingUp
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0'
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: JVM memory filling up (instance {{ $labels.instance }})
description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire.
# The query drops instances whose summed max_bytes is <= 0 so the ratio is never computed against a zero or undefined limit.
- alert: JvmNon-heapMemoryFillingUp
expr: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: JVM non-heap memory filling up (instance {{ $labels.instance }})
description: "JVM non-heap memory (metaspace/code cache) is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmGcTimeTooHigh
expr: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05'
for: 5m
labels:
severity: warning
annotations:
summary: JVM GC time too high (instance {{ $labels.instance }})
description: "JVM is spending too much time in garbage collection (> 5% of wall clock time)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmThreadsDeadlocked
expr: 'jvm_threads_deadlocked > 0'
for: 1m
labels:
severity: critical
annotations:
summary: JVM threads deadlocked (instance {{ $labels.instance }})
description: "JVM has deadlocked threads\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmThreadCountHigh
expr: 'jvm_threads_current > 300'
for: 5m
labels:
severity: warning
annotations:
summary: JVM thread count high (instance {{ $labels.instance }})
description: "JVM thread count is high (> 300), potential thread leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmThreadsBlocked
expr: 'jvm_threads_state{state="BLOCKED"} > 50'
for: 5m
labels:
severity: warning
annotations:
summary: JVM threads BLOCKED (instance {{ $labels.instance }})
description: "JVM has high number of BLOCKED threads, indicating lock contention\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The gc label carries the collector name reported by the JVM, e.g. "G1 Old Generation", "PS MarkSweep",
# "ConcurrentMarkSweep" or "MarkSweepCompact" — NOTE(review): confirm against the values your exporter emits.
# The match is case-insensitive ((?i)) because those names are capitalized; a lower-case-only pattern
# such as ".*old.*" does not match "G1 Old Generation". "marksweep" covers the CMS, Parallel and Serial old-gen collectors.
# ZGC and Shenandoah cycle names are not matched. Adjust the gc label filter if you use a different collector.
- alert: JvmOldGenGcFrequency
expr: 'rate(jvm_gc_collection_seconds_count{gc=~"(?i).*(old|major|marksweep).*"}[5m]) > 0.3'
for: 5m
labels:
severity: warning
annotations:
summary: JVM old gen GC frequency (instance {{ $labels.instance }})
description: "Frequent old/major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmDirectBufferPoolFillingUp
expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0'
for: 5m
labels:
severity: warning
annotations:
summary: JVM direct buffer pool filling up (instance {{ $labels.instance }})
description: "JVM direct buffer pool is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmObjectsPendingFinalization
expr: 'jvm_memory_objects_pending_finalization > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: JVM objects pending finalization (instance {{ $labels.instance }})
description: "JVM has objects pending finalization, potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific.
# This alert will also fire for Go, Python, or any process exposing these metrics.
- alert: JvmFileDescriptorsExhaustion
expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
for: 5m
labels:
severity: warning
annotations:
summary: JVM file descriptors exhaustion (instance {{ $labels.instance }})
description: "JVM process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmClassLoadingAnomaly
expr: 'rate(jvm_classes_loaded_total[5m]) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: JVM class loading anomaly (instance {{ $labels.instance }})
description: "Rapid class loading detected, potential classloader leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: JvmCompilationTimeSpike
expr: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1'
for: 5m
labels:
severity: warning
annotations:
summary: JVM compilation time spike (instance {{ $labels.instance }})
description: "Excessive JIT compilation time consuming CPU\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,23 +2,22 @@ groups:
- name: DanielqsjKafkaExporter
rules:
- alert: KafkaTopicsReplicas
expr: 'min(kafka_topic_partition_in_sync_replica) by (topic) < 3'
expr: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
for: 0m
labels:
severity: critical
annotations:
summary: Kafka topics replicas (instance {{ $labels.instance }})
description: "Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KafkaConsumerGroupLag
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
- alert: KafkaConsumersGroup
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
for: 1m
labels:
severity: warning
severity: critical
annotations:
summary: Kafka consumer group lag (instance {{ $labels.instance }})
description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: Kafka consumers group (instance {{ $labels.instance }})
description: "Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: LinkedinKafkaExporter
rules:
- alert: KafkaTopicOffsetDecreased

View file

@ -1,67 +0,0 @@
groups:
- name: AerogearKeycloakMetricsSpi
rules:
# The 5% threshold is a rough default; adjust it to your user base and expected error rates.
# A spike in failed logins may indicate a brute-force attack or a misconfigured client.
# Failure ratio = failed / (successful + failed), per realm, from 5m rates; skipped when there are no attempts.
- alert: KeycloakHighLoginFailureRate
expr: '(sum(rate(keycloak_failed_login_attempts_total[5m])) by (realm) / (sum(rate(keycloak_logins_total[5m])) by (realm) + sum(rate(keycloak_failed_login_attempts_total[5m])) by (realm))) * 100 > 5 and (sum(rate(keycloak_logins_total[5m])) by (realm) + sum(rate(keycloak_failed_login_attempts_total[5m])) by (realm)) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Keycloak high login failure rate (instance {{ $labels.instance }})
description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Only fires when login attempts exist but none succeed — may indicate an authentication outage.
# Expressed as "failed logins > 0 unless successful logins > 0" (per realm, 15m rates). Unlike
# "successful == 0 and ...", this also fires for a realm that has no keycloak_logins_total series at all,
# where an "and"/"+" against the missing series would produce no result.
# The alert value is the per-realm failed-login rate (attempts/second).
- alert: KeycloakNoSuccessfulLogins
expr: 'sum by (realm) (rate(keycloak_failed_login_attempts_total[15m])) > 0 unless sum by (realm) (rate(keycloak_logins_total[15m])) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Keycloak no successful logins (instance {{ $labels.instance }})
description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The 10% threshold is a rough default. A high refresh-token error ratio may indicate expired sessions
# or token store issues. Computed per realm from 5m rates; skipped when no refreshes are happening.
- alert: KeycloakHighTokenRefreshErrorRate
expr: '(sum(rate(keycloak_refresh_tokens_errors_total[5m])) by (realm) / sum(rate(keycloak_refresh_tokens_total[5m])) by (realm)) * 100 > 10 and sum(rate(keycloak_refresh_tokens_total[5m])) by (realm) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Keycloak high token refresh error rate (instance {{ $labels.instance }})
description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The 10% threshold is a rough default. Failing code-to-token exchanges may indicate misconfigured
# OAuth clients or replay attacks. Computed per realm from 5m rates; skipped when there are no exchanges.
- alert: KeycloakHighCode-to-tokenExchangeErrorRate
expr: '(sum(rate(keycloak_code_to_tokens_errors_total[5m])) by (realm) / sum(rate(keycloak_code_to_tokens_total[5m])) by (realm)) * 100 > 10 and sum(rate(keycloak_code_to_tokens_total[5m])) by (realm) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Keycloak high code-to-token exchange error rate (instance {{ $labels.instance }})
description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The 10% threshold is a rough default.
# Registration error ratio per realm from 5m rates; skipped when there are no registrations.
- alert: KeycloakHighRegistrationFailureRate
expr: '(sum(rate(keycloak_registrations_errors_total[5m])) by (realm) / sum(rate(keycloak_registrations_total[5m])) by (realm)) * 100 > 10 and sum(rate(keycloak_registrations_total[5m])) by (realm) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# keycloak_request_duration is reported in milliseconds, so the 2000 cutoff means 2 seconds (a rough default).
# Average duration per HTTP method = rate(sum) / rate(count) over 5m; skipped for methods with no requests.
- alert: KeycloakSlowRequestResponseTime
expr: 'sum(rate(keycloak_request_duration_sum[5m])) by (method) / sum(rate(keycloak_request_duration_count[5m])) by (method) > 2000 and sum(rate(keycloak_request_duration_count[5m])) by (method) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Keycloak slow request response time (instance {{ $labels.instance }})
description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: KubestateExporter
rules:
- alert: KubernetesNodeNotReady
@ -11,27 +10,16 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Node not ready (instance {{ $labels.instance }})
summary: Kubernetes Node ready (node {{ $labels.node }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Kubernetes nodes with scheduling disabled are not a problem in themselves.
# This alert is useful for being warned about nodes that remain unschedulable for a long time.
- alert: KubernetesNodeSchedulingDisabled
expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
for: 30m
labels:
severity: warning
annotations:
summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeMemoryPressure
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
summary: Kubernetes memory pressure (node {{ $labels.node }})
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeDiskPressure
@ -40,7 +28,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
summary: Kubernetes disk pressure (node {{ $labels.node }})
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeNetworkUnavailable
@ -53,7 +41,7 @@ groups:
description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNodeOutOfPodCapacity
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
for: 2m
labels:
severity: warning
@ -67,7 +55,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
@ -76,34 +64,16 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }})
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobNotStarting
expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes Job not starting (instance {{ $labels.instance }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobFailing
expr: '(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)'
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes CronJob failing (instance {{ $labels.instance }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobSuspended
expr: 'kube_cronjob_spec_suspend != 0'
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
@ -112,11 +82,11 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0'
expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
for: 2m
labels:
severity: warning
@ -134,12 +104,12 @@ groups:
description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
@ -148,11 +118,11 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleInability
expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
for: 2m
labels:
severity: warning
@ -170,7 +140,7 @@ groups:
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleMaximum
expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
for: 2m
labels:
severity: info
@ -193,7 +163,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
@ -202,7 +172,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicasetReplicasMismatch
@ -211,7 +181,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
@ -220,7 +190,7 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
@ -238,7 +208,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
@ -247,7 +217,7 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
@ -256,16 +226,16 @@ groups:
labels:
severity: warning
annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr: '(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
for: 10m
labels:
severity: warning
annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
@ -274,17 +244,16 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold should be customized for each cronjob name.
- alert: KubernetesCronjobTooLong
expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600'
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
for: 0m
labels:
severity: warning
annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
@ -293,26 +262,26 @@ groups:
labels:
severity: critical
annotations:
summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0'
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API server errors (instance {{ $labels.instance }})
description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiClientErrors
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
for: 2m
labels:
severity: critical
annotations:
summary: Kubernetes API client errors (instance {{ $labels.instance }})
description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesClientCertificateExpiresNextWeek
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
@ -333,7 +302,7 @@ groups:
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
for: 2m
labels:
severity: warning

View file

@ -2,15 +2,13 @@ groups:
- name: EmbeddedExporter
rules:
# Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
- alert: LinkerdHighErrorRate
expr: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
expr: 'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10'
for: 1m
labels:
severity: warning
annotations:
summary: Linkerd high error rate (instance {{ $labels.instance }})
description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,36 +0,0 @@
groups:
  - name: EmbeddedExporter
    rules:
      # NOTE(review): every expression in this group aggregates with sum()
      # (no `by (instance)`), so `{{ $labels.instance }}` in the summaries
      # presumably renders empty - confirm against a firing alert.

      # Soft budget warning. The threshold (1) is expressed in USD and the
      # `model` label carries the resolved (post-routing) model name.
      # `increase()` only yields a positive value once it has at least two
      # samples that show growth, so a brand-new counter series needs at
      # least two distinct request bursts, one or more scrape cycles apart.
      - alert: LitellmProviderSpendOverBudget
        expr: >-
          sum(increase(litellm_spend_metric_total{model=~"(claude-|anthropic/).*"}[24h])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'LiteLLM provider spend over budget (instance {{ $labels.instance }})'
          description: |-
            Cumulative spend for an LLM provider has exceeded the daily budget threshold. Replace the regex `(claude-|anthropic/).*` with your provider's model-name pattern. Useful as a soft-warning when `provider_budget_config` hard-cap is unavailable or disabled.
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Share of proxy requests that failed, over a 5-minute rate window.
      # The threshold 0.05 is a ratio (5%), not a percentage.
      - alert: LitellmProxyFailedRequestsRateHigh
        expr: >-
          sum(rate(litellm_proxy_failed_requests_metric_total[5m]))
          / sum(rate(litellm_proxy_total_requests_metric_total[5m]))
          > 0.05
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'LiteLLM proxy failed requests rate high (instance {{ $labels.instance }})'
          description: |-
            LiteLLM proxy is returning failed responses to clients (>5% error rate over 5min). Investigate downstream LLM provider availability or auth issues.
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # 95th percentile of total request latency, computed from the histogram
      # buckets summed across all series; the threshold 10 is in seconds
      # according to the description below.
      - alert: LitellmRequestLatencyP95High
        expr: >-
          histogram_quantile(0.95, sum(rate(litellm_request_total_latency_metric_bucket[5m])) by (le)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'LiteLLM request latency p95 high (instance {{ $labels.instance }})'
          description: |-
            LiteLLM request total latency p95 exceeds 10 seconds over 5min. Check downstream LLM provider response-times and proxy queue-depth.
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts
@ -15,28 +14,28 @@ groups:
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestErrors
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0'
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
for: 15m
labels:
severity: critical
annotations:
summary: Loki request errors (instance {{ $labels.instance }})
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestPanic
expr: 'sum(increase(loki_panic_total[5m])) by (namespace, job) > 0'
for: 0m
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request panic (instance {{ $labels.instance }})
description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: LokiRequestLatency
expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1'
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
for: 5m
labels:
severity: critical
annotations:
summary: Loki request latency (instance {{ $labels.instance }})
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,24 +0,0 @@
groups:
  - name: EmbeddedExporter
    rules:
      # Fires as soon as any index reports a document count of exactly zero
      # (`for: 0m` means there is no grace period).
      - alert: MeilisearchIndexIsEmpty
        expr: 'meilisearch_index_docs_count == 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Meilisearch index is empty (instance {{ $labels.instance }})'
          description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # NOTE(review): `meilisearch_http_response_time_seconds` is believed to
      # be exported as a histogram, in which case only the `_bucket`, `_sum`
      # and `_count` series exist and a comparison against the bare metric
      # name selects nothing and never fires - confirm against /metrics.
      # The rule therefore alerts on the 95th percentile derived from the
      # buckets, aggregated per instance so that the `instance` label used in
      # the summary survives. The 0.5 threshold (seconds) is unchanged.
      - alert: MeilisearchHttpResponseTime
        expr: >-
          histogram_quantile(0.95,
          sum by (instance, le) (rate(meilisearch_http_response_time_seconds_bucket[5m])))
          > 0.5
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Meilisearch http response time (instance {{ $labels.instance }})'
          description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,91 +0,0 @@
groups:
  - name: MemcachedExporter
    rules:
      # The 1m hold-off lets an instance restart without paging anyone.
      - alert: MemcachedDown
        expr: >-
          memcached_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Memcached down (instance {{ $labels.instance }})'
          description: |-
            Memcached instance is down on {{ $labels.instance }}
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Connection usage as a percentage of the configured maximum; the
      # trailing `> 0` clause skips series whose maximum is zero.
      - alert: 'MemcachedConnectionLimitApproaching(>80%)'
        expr: >-
          (memcached_current_connections / memcached_max_connections * 100) > 80
          and memcached_max_connections > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})'
          description: |-
            Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Same ratio as the 80% rule, escalated to critical at 95%.
      - alert: 'MemcachedConnectionLimitApproaching(>95%)'
        expr: >-
          (memcached_current_connections / memcached_max_connections * 100) > 95
          and memcached_max_connections > 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})'
          description: |-
            Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Per-slab out-of-memory counters are summed with the `slab` label
      # removed, then compared as a per-second rate.
      - alert: MemcachedOutOfMemoryErrors
        expr: >-
          sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached out of memory errors (instance {{ $labels.instance }})'
          description: |-
            Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # A well-used cache is expected to sit at high memory usage. This rule
      # only fires when usage nears the configured limit, at which point
      # evictions may start.
      - alert: 'MemcachedMemoryUsageHigh(>90%)'
        expr: >-
          (memcached_current_bytes / memcached_limit_bytes * 100) > 90
          and memcached_limit_bytes > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached memory usage high (> 90%) (instance {{ $labels.instance }})'
          description: |-
            Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Sustained evictions point to memory pressure: raise the memcached
      # memory limit or reduce cache usage. The 10 evictions/s threshold is
      # a rough default — tune it for your workload.
      - alert: MemcachedHighEvictionRate
        expr: >-
          rate(memcached_items_evicted_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached high eviction rate (instance {{ $labels.instance }})'
          description: |-
            Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # A low hit rate can mean poor cache utilization, wrong cache keys or
      # TTLs that are too short. The 80% threshold is a rough default — tune
      # it for your workload and access patterns. The final clause requires
      # some GET traffic (hits + misses > 0) before the ratio is evaluated.
      - alert: 'MemcachedLowCacheHitRate(<80%)'
        expr: >-
          (rate(memcached_commands_total{command="get", status="hit"}[5m])
          / (rate(memcached_commands_total{command="get", status="hit"}[5m])
          + rate(memcached_commands_total{command="get", status="miss"}[5m]))
          * 100) < 80
          and (rate(memcached_commands_total{command="get", status="hit"}[5m])
          + rate(memcached_commands_total{command="get", status="miss"}[5m]))
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})'
          description: |-
            Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # More than 3 rejected connections within a 5-minute window.
      - alert: MemcachedConnectionsRejected
        expr: >-
          increase(memcached_connections_rejected_total[5m]) > 3
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Memcached connections rejected (instance {{ $labels.instance }})'
          description: |-
            Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

      # Informational: more than 3 items were refused as too large within a
      # 5-minute window.
      - alert: MemcachedItemsTooLarge
        expr: >-
          increase(memcached_item_too_large_total[5m]) > 3
        for: 5m
        labels:
          severity: info
        annotations:
          summary: 'Memcached items too large (instance {{ $labels.instance }})'
          description: |-
            Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)
             VALUE = {{ $value }}
             LABELS = {{ $labels }}

View file

@ -2,11 +2,10 @@ groups:
- name: EmbeddedExporter
rules:
- alert: MinioClusterDiskOffline
expr: 'minio_cluster_drive_offline_total > 0'
expr: 'minio_cluster_disk_offline_total > 0'
for: 0m
labels:
severity: critical
@ -24,7 +23,7 @@ groups:
description: "Minio cluster node disk is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MinioDiskSpaceUsage
expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0'
expr: 'disk_storage_available / disk_storage_total * 100 < 10'
for: 0m
labels:
severity: warning

View file

@ -2,16 +2,15 @@ groups:
- name: DcuMongodbExporter
rules:
- alert: MongodbReplicationLag(dcu)
- alert: MongodbReplicationLag
expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
summary: MongoDB replication lag (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationStatus3
@ -59,29 +58,38 @@ groups:
summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen(dcu)
- alert: MongodbNumberCursorsOpen
expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
summary: MongoDB number cursors open (instance {{ $labels.instance }})
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts(dcu)
- alert: MongodbCursorsTimeouts
expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections(dcu)
expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
- alert: MongodbTooManyConnections
expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbVirtualMemoryUsage
expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,39 +2,35 @@ groups:
- name: PerconaMongodbExporter
rules:
# 1m delay allows a restart without triggering an alert.
- alert: MongodbDown
expr: 'mongodb_up == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB Down (instance {{ $labels.instance }})
description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MongodbReplicaMemberUnhealthy
expr: 'mongodb_rs_members_health == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: Mongodb replica member unhealthy (instance {{ $labels.instance }})
description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationLag(percona)
- alert: MongodbReplicationLag
expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }})
summary: MongoDB replication lag (instance {{ $labels.instance }})
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
- alert: MongodbReplicationHeadroom
expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
for: 0m
@ -44,29 +40,38 @@ groups:
summary: MongoDB replication headroom (instance {{ $labels.instance }})
description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen(percona)
- alert: MongodbNumberCursorsOpen
expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }})
summary: MongoDB number cursors open (instance {{ $labels.instance }})
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts(percona)
- alert: MongodbCursorsTimeouts
expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections(percona)
expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0'
- alert: MongodbTooManyConnections
expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbVirtualMemoryUsage
expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: StefanprodanMgobExporter
rules:
- alert: MgobBackupFailed

View file

@ -2,13 +2,11 @@ groups:
- name: MysqldExporter
rules:
# 1m delay allows a restart without triggering an alert.
- alert: MysqlDown
expr: 'mysql_up == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -16,7 +14,7 @@ groups:
description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlTooManyConnections(>80%)
expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0'
expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80'
for: 2m
labels:
severity: warning
@ -25,7 +23,7 @@ groups:
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlHighPreparedStatementsUtilization(>80%)
expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0'
expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
for: 2m
labels:
severity: warning
@ -34,7 +32,7 @@ groups:
description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlHighThreadsRunning
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0'
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
for: 2m
labels:
severity: warning
@ -42,20 +40,18 @@ groups:
summary: MySQL high threads running (instance {{ $labels.instance }})
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MysqlSlaveIoThreadNotRunning
expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1m delay allows a restart without triggering an alert.
- alert: MysqlSlaveSqlThreadNotRunning
expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
for: 1m
for: 0m
labels:
severity: critical
annotations:
@ -71,25 +67,23 @@ groups:
summary: MySQL Slave replication lag (instance {{ $labels.instance }})
description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase().
- alert: MysqlSlowQueries
expr: 'delta(mysql_global_status_slow_queries[1m]) > 0'
expr: 'increase(mysql_global_status_slow_queries[1m]) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: MySQL slow queries (instance {{ $labels.instance }})
description: "MySQL server has some new slow queries ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate().
- alert: MysqlInnodbLogWaits
expr: 'deriv(mysql_global_status_innodb_log_waits[15m]) > 10'
expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
for: 0m
labels:
severity: warning
annotations:
summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlRestarted
expr: 'mysql_global_status_uptime < 60'
@ -99,40 +93,3 @@ groups:
annotations:
summary: MySQL restarted (instance {{ $labels.instance }})
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate().
- alert: MysqlHighQps
expr: 'deriv(mysql_global_status_questions[1m]) > 10000'
for: 2m
labels:
severity: info
annotations:
summary: MySQL High QPS (instance {{ $labels.instance }})
description: "MySQL is being overload with unusual QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlTooManyOpenFiles
expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0'
for: 2m
labels:
severity: warning
annotations:
summary: MySQL too many open files (instance {{ $labels.instance }})
description: "MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlInnodbForceRecoveryIsEnabled
expr: 'mysql_global_variables_innodb_force_recovery != 0'
for: 2m
labels:
severity: warning
annotations:
summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlInnodbHistory_lenTooLong
expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
for: 2m
labels:
severity: warning
annotations:
summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,126 +2,40 @@ groups:
- name: NatsExporter
rules:
- alert: NatsHighConnectionCount
expr: 'gnatsd_varz_connections > 100'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high connection count (instance {{ $labels.instance }})
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighSubscriptionsCount
expr: 'gnatsd_connz_subscriptions > 50'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high subscriptions count (instance {{ $labels.instance }})
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighRoutesCount
expr: 'gnatsd_varz_routes > 10'
expr: 'gnatsd_routez_num_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighMemoryUsage
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high memory usage (instance {{ $labels.instance }})
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsSlowConsumers
expr: 'gnatsd_varz_slow_consumers > 0'
for: 3m
labels:
severity: critical
annotations:
summary: Nats slow consumers (instance {{ $labels.instance }})
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Replace job="nats" with the actual job name in your Prometheus configuration.
- alert: NatsServerDown
expr: 'absent(up{job="nats"})'
for: 5m
labels:
severity: critical
annotations:
summary: Nats server down (instance {{ $labels.instance }})
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
- alert: NatsHighCpuUsage
expr: 'gnatsd_varz_cpu > 80'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high CPU usage (instance {{ $labels.instance }})
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfConnections
expr: 'gnatsd_connz_num_connections > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of connections (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamStoreUsage
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamMemoryUsage
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfSubscriptions
expr: 'gnatsd_varz_subscriptions > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsTooManyErrors
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Nats too many errors (instance {{ $labels.instance }})
description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsJetstreamAccountsExceeded
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Only enable this alert if your deployment requires leaf node connections.
# This will fire spuriously if leaf nodes are not configured.
- alert: NatsLeafNodeConnectionIssue
expr: 'gnatsd_varz_leafnodes == 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,12 +2,10 @@ groups:
- name: EmbeddedExporter
rules:
# This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
- alert: NetdataHighCpuUsage
expr: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m]) > 80'
for: 5m
labels:
severity: warning
@ -15,17 +13,17 @@ groups:
summary: Netdata high cpu usage (instance {{ $labels.instance }})
description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataCpuStealNoisyNeighbor
expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
- alert: HostCpuStealNoisyNeighbor
expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="steal"}[1m]) > 10'
for: 5m
labels:
severity: warning
annotations:
summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataHighMemoryUsage
expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20 and netdata_system_ram_MiB_average > 0'
expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
for: 5m
labels:
severity: warning
@ -34,7 +32,7 @@ groups:
description: "Netdata high memory usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataLowDiskSpace
expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20 and netdata_disk_space_GB_average > 0'
expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20'
for: 5m
labels:
severity: warning
@ -67,7 +65,7 @@ groups:
severity: info
annotations:
summary: Netdata disk reallocated sectors (instance {{ $labels.instance }})
description: "Disk reallocated sectors detected ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Reallocated sectors on disk\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataDiskCurrentPendingSector
expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
@ -85,4 +83,4 @@ groups:
severity: warning
annotations:
summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})
description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Reported uncorrectable disk sectors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,11 +2,10 @@ groups:
- name: KnyarNginxExporter
rules:
- alert: NginxHighHttp4xxErrorRate
expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
for: 1m
labels:
severity: critical
@ -15,7 +14,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NginxHighHttp5xxErrorRate
expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
for: 1m
labels:
severity: critical

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: NomadJobFailed
@ -12,7 +11,7 @@ groups:
severity: warning
annotations:
summary: Nomad job failed (instance {{ $labels.instance }})
description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobLost
expr: 'nomad_nomad_job_summary_lost > 0'
@ -21,7 +20,7 @@ groups:
severity: warning
annotations:
summary: Nomad job lost (instance {{ $labels.instance }})
description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadJobQueued
expr: 'nomad_nomad_job_summary_queued > 0'
@ -30,7 +29,7 @@ groups:
severity: warning
annotations:
summary: Nomad job queued (instance {{ $labels.instance }})
description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NomadBlockedEvaluation
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
@ -39,4 +38,4 @@ groups:
severity: warning
annotations:
summary: Nomad blocked evaluation (instance {{ $labels.instance }})
description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,7 +2,6 @@ groups:
- name: EmbeddedExporter
rules:
- alert: OpenebsUsedPoolCapacity

View file

@ -1,60 +0,0 @@
# Alerting rules evaluated against opensearch_* metrics.
# Every description appends the raw sample value and the full label set.
groups:
  - name: OpensearchProjectOpensearchPrometheusExporter
    rules:
      # Any non-zero cluster status is reported immediately (no hold period).
      - alert: OpensearchIsUnhealthy
        expr: 'opensearch_cluster_status != 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'OpenSearch is unhealthy (instance {{ $labels.instance }})'
          description: "OpenSearch cluster {{ $labels.cluster }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # JVM heap usage above 90 (percent) sustained for 5 minutes.
      - alert: OpensearchHighHeapUsage
        expr: 'opensearch_jvm_mem_heap_used_percent > 90'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OpenSearch high heap usage (instance {{ $labels.instance }})'
          description: "OpenSearch heap usage on cluster {{ $labels.cluster }} is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Tripped count above zero for 5 minutes.
      # NOTE(review): if this metric is cumulative, the rule keeps firing after
      # the first trip until the series resets - confirm against the exporter.
      - alert: OpensearchCircuitbreakerTripped
        expr: 'opensearch_circuitbreaker_tripped_count > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OpenSearch circuitbreaker tripped (instance {{ $labels.instance }})'
          description: "The circuitbreaker on OpenSearch cluster {{ $labels.cluster }} has tripped to prevent Java OutOfMemoryError\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # At least one pending cluster task for 5 minutes.
      - alert: OpensearchHasPendingTasks
        expr: 'opensearch_cluster_pending_tasks_number > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OpenSearch has pending tasks (instance {{ $labels.instance }})'
          description: "OpenSearch cluster {{ $labels.cluster }} has pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Throttling flag above zero for 5 minutes.
      - alert: OpensearchIndexingIsThrottled
        expr: 'opensearch_indices_indexing_is_throttled_bool > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OpenSearch indexing is throttled (instance {{ $labels.instance }})'
          description: "Indexing on OpenSearch cluster {{ $labels.cluster }} is throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Active shard percentage below 100 for 5 minutes.
      - alert: OpensearchHasInactiveShards
        expr: 'opensearch_cluster_shards_active_percent < 100.0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'OpenSearch has inactive shards (instance {{ $labels.instance }})'
          description: "OpenSearch cluster {{ $labels.cluster }} has inactive shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,192 +0,0 @@
# Alerting rules evaluated against openstack_* metrics plus the scrape `up` series.
# Ratio rules pair the division with a "> 0" guard on the denominator so that
# zero or negative limits (e.g. the -1 "unlimited" quota) never produce an alert.
groups:
  - name: OpenstackExporter
    rules:
      # Adjust the job label regex to match the actual job name in your Prometheus scrape config.
      - alert: OpenstackExporterDown
        expr: 'up{job=~".*openstack.*"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenStack exporter down (instance {{ $labels.instance }})
          description: "The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Only agents whose adminState is "enabled" are considered.
      - alert: OpenstackNovaAgentDown
        expr: 'openstack_nova_agent_state{adminState="enabled"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenStack Nova agent down (instance {{ $labels.instance }})
          description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Only agents whose adminState is "up" are considered.
      - alert: OpenstackNeutronAgentDown
        expr: 'openstack_neutron_agent_state{adminState="up"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenStack Neutron agent down (instance {{ $labels.instance }})
          description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Only agents whose adminState is "enabled" are considered.
      - alert: OpenstackCinderAgentDown
        expr: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenStack Cinder agent down (instance {{ $labels.instance }})
          description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
      - alert: OpenstackHypervisorHighVcpuUsage
        expr: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }})
          description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
      - alert: OpenstackHypervisorHighMemoryUsage
        expr: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack hypervisor high memory usage (instance {{ $labels.instance }})
          description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Local storage used / available above 0.9 for 5 minutes.
      - alert: OpenstackHypervisorHighDiskUsage
        expr: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack hypervisor high disk usage (instance {{ $labels.instance }})
          description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
      - alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted
        expr: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }})
          description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) once memory used / max exceeds 0.9.
      - alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted
        expr: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }})
          description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) once instances used / max exceeds 0.9.
      - alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted
        expr: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }})
          description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) once volume GB used / max exceeds 0.9.
      - alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted
        expr: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }})
          description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Free / total pool capacity below 0.1 for 5 minutes.
      - alert: OpenstackCinderPoolLowFreeCapacity
        expr: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Cinder pool low free capacity (instance {{ $labels.instance }})
          description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: OpenstackNeutronFloatingIpsAssociatedButNotActive
        expr: 'openstack_neutron_floating_ips_associated_not_active > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }})
          description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: OpenstackNeutronRoutersNotActive
        expr: 'openstack_neutron_routers_not_active > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Neutron routers not active (instance {{ $labels.instance }})
          description: "{{ $value }} Neutron routers are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires immediately (for: 0m) once used / total IPs exceeds 0.9.
      - alert: OpenstackNeutronSubnetIpPoolExhaustion
        expr: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }})
          description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: OpenstackNeutronPortsWithoutIps
        expr: 'openstack_neutron_ports_no_ips > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Neutron ports without IPs (instance {{ $labels.instance }})
          description: "{{ $value }} active ports have no IP addresses assigned\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Selects load balancer series whose operating_status label is anything but "ONLINE".
      - alert: OpenstackLoadBalancerNotOnline
        expr: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack load balancer not online (instance {{ $labels.instance }})
          description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # count() is used instead of sum(): the description reports {{ $value }} as a
      # number of instances, and count() yields the number of matching series
      # whatever numeric value each sample carries.
      # NOTE(review): the exporter appears to store a status code (not 1) in each
      # sample, which would inflate a sum - confirm against the exporter docs.
      # Grouping by `instance` keeps the label the summary template reads.
      - alert: OpenstackNovaInstancesInErrorState
        expr: 'count by (instance) (openstack_nova_server_status{status="ERROR"}) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Nova instances in ERROR state (instance {{ $labels.instance }})
          description: "{{ $value }} Nova instances are in ERROR state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Matches every status label starting with "error".
      - alert: OpenstackCinderVolumesInErrorState
        expr: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack Cinder volumes in error state (instance {{ $labels.instance }})
          description: "{{ $value }} Cinder volumes are in an error state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # This alert factors in the allocation ratio to compute effective capacity.
      # The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
      - alert: OpenstackPlacementResourceHighUsage
        expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenStack placement resource high usage (instance {{ $labels.instance }})
          description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,128 +0,0 @@
# Alerting rules for the OpenTelemetry Collector's own telemetry.
groups:
  - name: EmbeddedExporter
    # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
    # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
    # All collector internal metrics are prefixed with 'otelcol_'.
    rules:
      # Adjust the job label regex to match the actual job name in your Prometheus scrape config.
      - alert: OpentelemetryCollectorDown
        expr: 'up{job=~".*otel.*collector.*"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorReceiverRefusedSpans
        expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
        expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorReceiverRefusedLogRecords
        expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorExporterFailedSpans
        expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorExporterFailedMetricPoints
        expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      - alert: OpentelemetryCollectorExporterFailedLogRecords
        expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Queue size / capacity above 0.8, evaluated with no hold period.
      # The division keeps only instance, job and exporter on its result, so the
      # zero-capacity guard must be joined on the same three labels; a plain `and`
      # compares full label sets and silently drops the alert whenever the
      # capacity series carries any additional label.
      - alert: OpentelemetryCollectorExporterQueueNearlyFull
        expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      # These processor metrics are deprecated since collector v0.110.0.
      - alert: OpentelemetryCollectorProcessorRefusedSpans
        expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold of 0.05/s avoids firing on transient single-event spikes.
      # These processor metrics are deprecated since collector v0.110.0.
      - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
        expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Heap-allocated bytes / total system memory bytes above 0.9 for 5 minutes,
      # joined per (instance, job).
      - alert: OpentelemetryCollectorHighMemoryUsage
        expr: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The otlp receiver accepted nothing over 5 minutes while still refusing spans.
      - alert: OpentelemetryCollectorOtlpReceiverErrors
        expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,84 +0,0 @@
# Alerting rules evaluated against oracledb_* metrics.
# Every description appends the raw sample value and the full label set.
groups:
  - name: IamsethOracledbExporter
    rules:
      # 1m delay allows a restart without triggering an alert.
      - alert: OracleDbDown
        expr: 'oracledb_up == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Oracle DB down (instance {{ $labels.instance }})'
          description: "Oracle Database instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold is workload-dependent. Adjust 85% to suit your environment.
      # The trailing "> 0" guard skips series whose session limit is zero or negative.
      - alert: OracleDbSessionsReachingLimit(>85%)
        expr: 'oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 and oracledb_resource_limit_value{resource_name="sessions"} > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB sessions reaching limit (> 85%) (instance {{ $labels.instance }})'
          description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold is workload-dependent. Adjust 85% to suit your environment.
      # The trailing "> 0" guard skips series whose process limit is zero or negative.
      - alert: OracleDbProcessesReachingLimit(>85%)
        expr: 'oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 and oracledb_resource_limit_value{resource_name="processes"} > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB processes reaching limit (> 85%) (instance {{ $labels.instance }})'
          description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Warning tier of the tablespace rules: used percent above 85 for 5 minutes.
      - alert: OracleDbTablespaceReachingCapacity(>85%)
        expr: 'oracledb_tablespace_used_percent > 85'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB tablespace reaching capacity (> 85%) (instance {{ $labels.instance }})'
          description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Critical tier of the tablespace rules: used percent above 95 for 5 minutes.
      - alert: OracleDbTablespaceFull(>95%)
        expr: 'oracledb_tablespace_used_percent > 95'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Oracle DB tablespace full (> 95%) (instance {{ $labels.instance }})'
          description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A high rollback rate (>20%) often indicates application-level issues such as deadlocks, constraint violations, or poorly designed transactions.
      # The trailing guard requires a non-zero commit + rollback rate, so idle
      # databases are not evaluated.
      - alert: OracleDbHighUserRollbacks
        expr: 'rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB high user rollbacks (instance {{ $labels.instance }})'
          description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Threshold is highly workload-dependent. Adjust 200 to suit your environment.
      - alert: OracleDbTooManyActiveSessions
        expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB too many active sessions (instance {{ $labels.instance }})'
          description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The metric from v$waitclassmetric is already a normalized rate (centiseconds per second). Threshold 300 means 3 seconds of I/O wait per second of wall time.
      - alert: OracleDbHighWaitTime(userI/o)
        expr: 'oracledb_wait_time_user_io > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle DB high wait time (user I/O) (instance {{ $labels.instance }})'
          description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,13 +2,11 @@ groups:
- name: EmbeddedExporterPatroni
rules:
# 1m delay allows a restart without triggering an alert.
- alert: PatroniHasNoLeader
expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
for: 1m
expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
for: 0m
labels:
severity: critical
annotations:

View file

@ -2,7 +2,6 @@ groups:
- name: SpreakerPgbouncerExporter
rules:
- alert: PgbouncerActiveConnections
@ -21,10 +20,10 @@ groups:
severity: warning
annotations:
summary: PGBouncer errors (instance {{ $labels.instance }})
description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PgbouncerMaxConnections
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0'
for: 0m
labels:
severity: critical

Some files were not shown because too many files have changed in this diff Show more