mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Compare commits
285 commits
2024-05-13
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5cc052fc0a | ||
|
|
63b80c8078 | ||
|
|
3a847e3d02 | ||
|
|
96fc299432 | ||
|
|
074736db2c | ||
|
|
832376c598 | ||
|
|
5c41e54297 | ||
|
|
49dbf0309f | ||
|
|
0cb56fdcfc | ||
|
|
56c10ee930 | ||
|
|
2bdcdbb54e | ||
|
|
1fb78854d4 | ||
|
|
07b24067f3 | ||
|
|
09a0755bee | ||
|
|
bbdcbb7956 | ||
|
|
d1021e7c8b | ||
|
|
adc5477b1e | ||
|
|
b14bfb236b | ||
|
|
43427987af | ||
|
|
4c9da9ed24 | ||
|
|
8ca1fe591f | ||
|
|
f5f4fdfba4 | ||
|
|
73fff11969 | ||
|
|
7fd73364a0 | ||
|
|
b2563bb228 | ||
|
|
90f0a63450 | ||
|
|
353133d23f | ||
|
|
eccf556bdb | ||
|
|
e0311c3c09 | ||
|
|
6d8b2b3671 | ||
|
|
bb8ac9b0cd | ||
|
|
6b2a5af9f9 | ||
|
|
b58c180dcb | ||
|
|
6070e81097 | ||
|
|
4481bb3276 | ||
|
|
b4324742be | ||
|
|
5a5976c9a3 | ||
|
|
1c5f626046 | ||
|
|
bb055773b4 | ||
|
|
d38511d7cb | ||
|
|
a56d8cf2a4 | ||
|
|
25418c5db2 | ||
|
|
5366d4b9ae | ||
|
|
1f8bcca779 | ||
|
|
954999dfa9 | ||
|
|
297fd9864c | ||
|
|
5c166e8403 | ||
|
|
ab87fdcf30 | ||
|
|
aa7d93ce95 | ||
|
|
a4d0b1370c | ||
|
|
d31b3f9ba0 | ||
|
|
89d8423d93 | ||
|
|
814dd5d3fb | ||
|
|
e6ea45aec1 | ||
|
|
bea2dc45b4 | ||
|
|
dd0c8372f9 | ||
|
|
132329abd8 | ||
|
|
9e80bb910e | ||
|
|
79afa21610 | ||
|
|
0d148832d3 | ||
|
|
c2615fae52 | ||
|
|
72c9e922c0 | ||
|
|
ed1515015a | ||
|
|
2258835c30 | ||
|
|
b8fd051a55 | ||
|
|
87d0610246 | ||
|
|
7ba6b2d367 | ||
|
|
b13d59bce6 | ||
|
|
9d9c648cdd | ||
|
|
af2f277830 | ||
|
|
e3a7165a65 | ||
|
|
c0e1f7a5f5 | ||
|
|
1aafa40913 | ||
|
|
4fb1aa9ae4 | ||
|
|
a4581ed322 | ||
|
|
f36c23e393 | ||
|
|
03963ef6f9 | ||
|
|
06f8b048a3 | ||
|
|
5d099fcae1 | ||
|
|
9d00396bc8 | ||
|
|
2b99cf1f76 | ||
|
|
e8eb75c2e2 | ||
|
|
5071e01ad9 | ||
|
|
6423f93ba7 | ||
|
|
1455e0fd77 | ||
|
|
ba5c9a3280 | ||
|
|
d8315eb3bc | ||
|
|
7f346ede99 | ||
|
|
b58b498bbb | ||
|
|
ff17e9c69b | ||
|
|
7ee16641ac | ||
|
|
4da60669d0 | ||
|
|
f974552ef1 | ||
|
|
eeba1ebbaa | ||
|
|
8b443be6d2 | ||
|
|
30bbedbc79 | ||
|
|
577c36d9ae | ||
|
|
fd3bfb02c0 | ||
|
|
fab9193407 | ||
|
|
97aae5dabf | ||
|
|
c390641203 | ||
|
|
e6cdcdb9e5 | ||
|
|
1db2c6f196 | ||
|
|
88e2c19017 | ||
|
|
258220b4f0 | ||
|
|
20651aa10d | ||
|
|
32f639da3b | ||
|
|
bf7b902881 | ||
|
|
d44bfd4c4b | ||
|
|
2b239736cf | ||
|
|
281142567c | ||
|
|
6bec57ae96 | ||
|
|
f97f692596 | ||
|
|
7397eb24ec | ||
|
|
be7a2e4d5d | ||
|
|
8bd2265fe1 | ||
|
|
c064d2264e | ||
|
|
375a36f82a | ||
|
|
89e703d763 | ||
|
|
9f6d4fd2a2 | ||
|
|
3db9281508 | ||
|
|
b039066277 | ||
|
|
01a5791376 | ||
|
|
e2af1325c6 | ||
|
|
c37ef8f50c | ||
|
|
89842beb5c | ||
|
|
8f462ce962 | ||
|
|
879436f440 | ||
|
|
080a792777 | ||
|
|
1e4e3d17bc | ||
|
|
9ae17eca97 | ||
|
|
bc41215c8f | ||
|
|
80400e9a56 | ||
|
|
eeebb90e6f | ||
|
|
0693ed168e | ||
|
|
e60601fdcd | ||
|
|
9998e22145 | ||
|
|
52cc00fc4c | ||
|
|
dd10c7ef05 | ||
|
|
51aea96ba7 | ||
|
|
1d69457017 | ||
|
|
f0107caf9e | ||
|
|
34cc80ffea | ||
|
|
a5d1c04955 | ||
|
|
65551ae19f | ||
|
|
570521429e | ||
|
|
55f16705eb | ||
|
|
2b5c8b0ec7 | ||
|
|
81081bdda5 | ||
|
|
d400e3e64d | ||
|
|
1136aa3a87 | ||
|
|
f810ff531d | ||
|
|
74ba870f05 | ||
|
|
ffa260b39d | ||
|
|
766b224c67 | ||
|
|
79f2858037 | ||
|
|
d6589237e1 | ||
|
|
d0d1b00a7b | ||
|
|
e617c07179 | ||
|
|
48f2dde80c | ||
|
|
cea78d7fd6 | ||
|
|
d58bc324ad | ||
|
|
4acbddb21a | ||
|
|
6e2db98590 | ||
|
|
ae8cfb0366 | ||
|
|
9edef74e73 | ||
|
|
2f9279d707 | ||
|
|
606d6fc592 | ||
|
|
7832e01082 | ||
|
|
b158ebb551 | ||
|
|
237e89babc | ||
|
|
264bcb82be | ||
|
|
dfac84209d | ||
|
|
5fbce5f513 | ||
|
|
a2c31358d1 | ||
|
|
edae18b8df | ||
|
|
0a55137e6a | ||
|
|
3abc7144aa | ||
|
|
7bced89d2d | ||
|
|
52e4ba143c | ||
|
|
b04b11ce1d | ||
|
|
554850df41 | ||
|
|
ea63d8001a | ||
|
|
748524d580 | ||
|
|
6ebe6d8a8e | ||
|
|
a5a3c2cd92 | ||
|
|
a3325114ea | ||
|
|
4b1b8242cb | ||
|
|
67cf6892a4 | ||
|
|
98d6e7db05 | ||
|
|
becbe1be3b | ||
|
|
e0e3cdda1d | ||
|
|
4be87d7796 | ||
|
|
fd9da90c1d | ||
|
|
79f45a5146 | ||
|
|
9f5c641bdd | ||
|
|
aca1bdf1fb | ||
|
|
4666830538 | ||
|
|
198035eaf4 | ||
|
|
b3d25fafcf | ||
|
|
6446bb44be | ||
|
|
a75d5124c5 | ||
|
|
3b440fec7b | ||
|
|
32a4bfb19b | ||
|
|
8b730ef059 | ||
|
|
93f9daecee | ||
|
|
69c8208e3c | ||
|
|
97a31f34e5 | ||
|
|
242054f7dc | ||
|
|
4335f85830 | ||
|
|
7bcae33011 | ||
|
|
2127c4ce90 | ||
|
|
9963b750ac | ||
|
|
c189984d0f | ||
|
|
807db03d0d | ||
|
|
6838196343 | ||
|
|
0f4b45d127 | ||
|
|
4e49e77d29 | ||
|
|
11a78f0f06 | ||
|
|
7889a9a29b | ||
|
|
add097c489 | ||
|
|
12b8acb1b8 | ||
|
|
4a7b9b5c72 | ||
|
|
20f9a36615 | ||
|
|
fb857e8b39 | ||
|
|
2f9c0c0483 | ||
|
|
eb92a79898 | ||
|
|
ae12871fa9 | ||
|
|
10d00c66da | ||
|
|
70ac7d9cae | ||
|
|
fc6b3faadc | ||
|
|
d916b7c6ab | ||
|
|
cbb2337438 | ||
|
|
53a369769d | ||
|
|
bdcc67c04e | ||
|
|
84a3b517a8 | ||
|
|
4533f23b79 | ||
|
|
52d4a8c744 | ||
|
|
c5203e94d0 | ||
|
|
a8d7c43b30 | ||
|
|
fff8a80ae5 | ||
|
|
4e38ae2087 | ||
|
|
8c3d06502f | ||
|
|
8a220b1b8a | ||
|
|
353ef1ed95 | ||
|
|
14949721ba | ||
|
|
bb75cb2c68 | ||
|
|
f9e683896f | ||
|
|
c41fda1d92 | ||
|
|
7313acce36 | ||
|
|
640f06588d | ||
|
|
cd5b39a1f0 | ||
|
|
35596c866f | ||
|
|
d6d6ae4ef8 | ||
|
|
65a5f586cb | ||
|
|
4aa45dee05 | ||
|
|
f08e8df514 | ||
|
|
995ab4d27a | ||
|
|
3bf8d6d824 | ||
|
|
8c0bdc2b24 | ||
|
|
02687db33d | ||
|
|
d1715de751 | ||
|
|
61da73d517 | ||
|
|
225607cf7f | ||
|
|
2c764df932 | ||
|
|
58ade95b8b | ||
|
|
47e74f65e0 | ||
|
|
9557d4b50e | ||
|
|
b6a6c2e313 | ||
|
|
60c235975c | ||
|
|
ca4fb01c6d | ||
|
|
1ee046b739 | ||
|
|
1e4ea0b3e7 | ||
|
|
8759c50440 | ||
|
|
9b0ac7d230 | ||
|
|
61a40270d9 | ||
|
|
7dd767c4b4 | ||
|
|
1adecd9ee7 | ||
|
|
9877561b6c | ||
|
|
826be5877f | ||
|
|
262e451625 | ||
|
|
81079a2a7e | ||
|
|
8460f9008e | ||
|
|
4963331101 | ||
|
|
04886da968 |
191 changed files with 20558 additions and 2955 deletions
1
.github/FUNDING.yml
vendored
1
.github/FUNDING.yml
vendored
|
|
@ -1 +1,2 @@
|
|||
github: [samber]
|
||||
ko_fi: samuelberthe
|
||||
|
|
|
|||
5
.github/dependabot.yml
vendored
5
.github/dependabot.yml
vendored
|
|
@ -5,3 +5,8 @@ updates:
|
|||
directory: "/"
|
||||
schedule:
|
||||
interval: "monthly"
|
||||
|
||||
- package-ecosystem: "npm"
|
||||
directory: "/site"
|
||||
schedule:
|
||||
interval: "monthly"
|
||||
|
|
|
|||
25
.github/workflows/dependabot-automerge.yaml
vendored
Normal file
25
.github/workflows/dependabot-automerge.yaml
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
name: Dependabot automerge
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, synchronize]
|
||||
|
||||
jobs:
|
||||
automerge:
|
||||
runs-on: ubuntu-latest
|
||||
if: github.actor == 'dependabot[bot]'
|
||||
permissions:
|
||||
contents: write
|
||||
pull-requests: write
|
||||
steps:
|
||||
- name: Fetch Dependabot metadata
|
||||
id: metadata
|
||||
uses: dependabot/fetch-metadata@v3
|
||||
|
||||
- name: Enable auto-merge for github-actions updates
|
||||
if: steps.metadata.outputs.package-ecosystem == 'github_actions'
|
||||
run: gh pr merge --auto --squash "$PR_URL"
|
||||
env:
|
||||
PR_URL: ${{ github.event.pull_request.html_url }}
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
62
.github/workflows/deploy.yml
vendored
Normal file
62
.github/workflows/deploy.yml
vendored
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
name: Deploy Astro site to GitHub Pages
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [master]
|
||||
workflow_dispatch:
|
||||
|
||||
# Only allow one concurrent deployment
|
||||
concurrency:
|
||||
group: pages
|
||||
cancel-in-progress: false
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
jobs:
|
||||
build:
|
||||
name: Build
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 'latest'
|
||||
cache: npm
|
||||
cache-dependency-path: site/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: site
|
||||
run: npm ci
|
||||
|
||||
- name: Build Astro site
|
||||
working-directory: site
|
||||
env:
|
||||
ASTRO_TELEMETRY_DISABLED: "1"
|
||||
run: npm run build
|
||||
|
||||
- name: Build Pagefind search index
|
||||
working-directory: site
|
||||
run: npx pagefind --site dist
|
||||
|
||||
- name: Upload Pages artifact
|
||||
uses: actions/upload-pages-artifact@v5
|
||||
with:
|
||||
path: site/dist
|
||||
|
||||
deploy:
|
||||
name: Deploy
|
||||
needs: build
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v5
|
||||
|
|
@ -1,34 +1,38 @@
|
|||
name: Publish
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
name: Publish
|
||||
# Only run in the upstream repository (owner 'samber'), not in forks
|
||||
if: github.repository_owner == 'samber'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Ruby
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: 2.7
|
||||
ruby-version: '3.4'
|
||||
|
||||
- name: Set up yq
|
||||
uses: mikefarah/yq@master
|
||||
uses: mikefarah/yq@v4
|
||||
|
||||
- name: Install liquid
|
||||
run: gem install liquid-cli
|
||||
run: |
|
||||
gem install liquid -v 5.5.1
|
||||
gem install liquid-cli
|
||||
|
||||
- name: Build rule configuration
|
||||
run: |
|
||||
gem install liquid-cli
|
||||
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
||||
|
||||
rm -rf dist/rules
|
||||
|
|
@ -38,7 +42,7 @@ jobs:
|
|||
mkdir -p "${subdir}"
|
||||
|
||||
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
|
||||
|
||||
|
||||
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
|
||||
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
|
||||
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
|
||||
|
|
@ -51,7 +55,7 @@ jobs:
|
|||
# https://peterevans.dev/posts/github-actions-how-to-automate-code-formatting-in-pull-requests/
|
||||
- name: Check for modified files
|
||||
id: git-check
|
||||
run: echo ::set-output name=modified::$(git status -s --porcelain | wc -l | awk '{$1=$1};1')
|
||||
run: echo "modified=$(git status -s --porcelain | wc -l | awk '{$1=$1};1')" >> $GITHUB_OUTPUT
|
||||
- name: Push changes
|
||||
if: steps.git-check.outputs.modified != '0'
|
||||
run: |
|
||||
38
.github/workflows/site.yml
vendored
Normal file
38
.github/workflows/site.yml
vendored
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
name: Site build
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- site/**
|
||||
- _data/**
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- site/**
|
||||
- _data/**
|
||||
|
||||
jobs:
|
||||
site-build:
|
||||
name: Build Astro site
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Node.js
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 'latest'
|
||||
cache: npm
|
||||
cache-dependency-path: site/package-lock.json
|
||||
|
||||
- name: Install dependencies
|
||||
working-directory: site
|
||||
run: npm ci
|
||||
|
||||
- name: Build Astro site
|
||||
working-directory: site
|
||||
env:
|
||||
ASTRO_TELEMETRY_DISABLED: "1"
|
||||
run: npm run build
|
||||
19
.github/workflows/test.yml
vendored
19
.github/workflows/test.yml
vendored
|
|
@ -1,6 +1,14 @@
|
|||
name: Promtool check
|
||||
|
||||
on: [pull_request, push]
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- _data/**
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths:
|
||||
- _data/**
|
||||
|
||||
jobs:
|
||||
promtool-check:
|
||||
|
|
@ -8,22 +16,21 @@ jobs:
|
|||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Ruby
|
||||
uses: ruby/setup-ruby@v1
|
||||
with:
|
||||
ruby-version: 2.7
|
||||
ruby-version: 3.4
|
||||
|
||||
- name: Set up yq
|
||||
uses: mikefarah/yq@master
|
||||
uses: mikefarah/yq@v4
|
||||
|
||||
- name: Install liquid
|
||||
run: gem install liquid-cli
|
||||
|
||||
- name: Build rule configuration
|
||||
run: |
|
||||
gem install liquid-cli
|
||||
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
||||
|
||||
for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
|
||||
|
|
@ -31,7 +38,7 @@ jobs:
|
|||
mkdir -p "${subdir}"
|
||||
|
||||
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
|
||||
|
||||
|
||||
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
|
||||
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
|
||||
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
|
||||
|
|
|
|||
15
.gitignore
vendored
15
.gitignore
vendored
|
|
@ -1,6 +1,13 @@
|
|||
_site/
|
||||
.sass-cache/
|
||||
.jekyll-cache/
|
||||
.jekyll-metadata
|
||||
# Generated data
|
||||
_data/rules.json
|
||||
test/rules/
|
||||
|
||||
# Node / Astro
|
||||
/node_modules
|
||||
site/node_modules/
|
||||
site/dist/
|
||||
site/.astro/
|
||||
site/public/pagefind/
|
||||
|
||||
# Misc
|
||||
.worktrees/
|
||||
216
CLAUDE.md
Normal file
216
CLAUDE.md
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
# CLAUDE.md
|
||||
|
||||
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
||||
|
||||
## Project Overview
|
||||
|
||||
A curated collection of ~940 Prometheus alerting rules covering 90+ services across 100+ exporters, organized in categories: basic resource monitoring (Prometheus, host/hardware, SMART, Docker, Blackbox, Windows, VMware, Netdata), databases (MySQL, PostgreSQL, Redis, MongoDB, Elasticsearch, Cassandra, Clickhouse, CouchDB, etc.), message brokers (RabbitMQ, Kafka, Pulsar, Nats, Zookeeper), proxies/load balancers/service meshes (Nginx, Apache, HaProxy, Traefik, Caddy, Linkerd, Istio), runtimes (PHP-FPM, JVM, Sidekiq), data engineering (Apache Flink, Apache Spark, Hadoop), orchestrators (Kubernetes, Nomad, Consul, Etcd, OpenStack), CI/CD (Jenkins, ArgoCD, FluxCD, GitLab CI, Spinnaker), network and security (SSL/TLS, CoreDNS, Vault, Cloudflare, Cilium, eBPF), storage (Ceph, ZFS, OpenEBS, Minio), cloud providers (AWS, Azure, DigitalOcean), observability (Thanos, Loki, Cortex, OpenTelemetry Collector, Grafana Tempo/Mimir/Alloy, Jaeger), and other (APC UPS, Graph Node).
|
||||
|
||||
All rules are stored in a single YAML data file (`_data/rules.yml`) and rendered as a static site built with Astro + TypeScript (located in `site/`). The site provides copy-pasteable Prometheus alert snippets and downloadable rule files per exporter.
|
||||
|
||||
The project is community-driven. Most contributions are PRs adding or updating rules in `_data/rules.yml`. Files in `dist/rules/` are auto-generated on merge — never edit them manually.
|
||||
|
||||
## Architecture
|
||||
|
||||
- **`_data/rules.yml`** — The single source of truth for all alerting rules. This is the main file contributors edit. It is NOT a valid Prometheus config; the site renders each rule into copy-pasteable Prometheus alert format.
|
||||
- **`site/`** — Astro + TypeScript static site. Run `npm run dev` inside this directory to develop locally.
|
||||
- **`site/src/data/rules.ts`** — Typed wrappers and helper functions over `_data/rules.yml`.
|
||||
- **`site/src/data/site.ts`** — Shared site metadata constants (URLs, author, schema objects).
|
||||
- **`site/src/pages/`** — Astro page routes: `index.astro` (homepage), `rules/[group]/[service].astro` (per-service rule pages), `alertmanager.astro`, `blackbox-exporter.astro`, `sleep-peacefully.astro` (guides).
|
||||
- **`site/src/layouts/BaseLayout.astro`** — Root HTML layout (SEO, GA, dark mode).
|
||||
- **`site/src/layouts/GuideLayout.astro`** — Layout for guide pages (TOC, hero, related guides).
|
||||
- **`site/src/components/`** — Shared Astro components (Header, Footer, Sidebar, RuleCard, ExporterSection, etc.).
|
||||
- **`site/astro.config.mjs`** — Astro configuration (sitemap, Vite YAML plugin, base URL).
|
||||
- **`dist/rules/`** — Pre-built downloadable rule files organized by service/exporter (referenced in the site for `wget` commands).
|
||||
|
||||
## Rules YAML Structure
|
||||
|
||||
Services are listed in README.md.
|
||||
|
||||
`_data/rules.yml` hierarchy:
|
||||
```
|
||||
groups:
|
||||
- name: "<category>" # e.g. "Basic resource monitoring"
|
||||
services:
|
||||
- name: "<service>" # e.g. "Host and hardware"
|
||||
exporters:
|
||||
- name: "<exporter>"
|
||||
slug: "<slug>" # used for download URLs
|
||||
doc_url: "<url>" # optional link to exporter docs
|
||||
comments: # optional, exporter-level multiline notes rendered before rules
|
||||
"<comment>"
|
||||
rules:
|
||||
- name: "<alert name>"
|
||||
description: "<text>"
|
||||
query: "<PromQL>"
|
||||
severity: warning|critical|info
|
||||
for: "<duration>" # optional, defaults to 0m
|
||||
comments: # optional, rendered as multiline YAML comments
|
||||
"<comment>"
|
||||
```
|
||||
|
||||
Services are grouped by category. If you are not sure about the classification, ask the developer.
|
||||
|
||||
## Running Locally
|
||||
|
||||
```bash
|
||||
cd site
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
Site serves at http://localhost:4321/awesome-prometheus-alerts.
|
||||
|
||||
To build for production:
|
||||
|
||||
```bash
|
||||
cd site
|
||||
npm run build
|
||||
npm run preview
|
||||
```
|
||||
|
||||
## Contributing Rules
|
||||
|
||||
All rule changes go in `_data/rules.yml`. Each rule needs: `name`, `description`, `query` (valid PromQL), and `severity`. The `for` field is optional. Descriptions should be factual ("what") and include root cause hints ("why"). Queries must be tested against the latest exporter version. Never modify files in `dist/` — they are auto-generated on merge.
|
||||
|
||||
## Query Validation
|
||||
|
||||
- When adding or updating an alert, verify that the PromQL query references metric series that actually exist in the related exporter. Check the exporter's documentation or source code to confirm series names.
|
||||
- If a metric series has been deprecated or removed in a newer version of the exporter, update the query to use the replacement series, or remove the rule if no replacement exists. Known examples: `kube_hpa_*` renamed to `kube_horizontalpodautoscaler_*` in kube-state-metrics 2.x; `node_hwmon_temp_alarm` does not exist (correct: `node_hwmon_temp_crit_alarm_celsius`); node-exporter CLI flags get renamed across versions.
|
||||
- When writing or reviewing a query, search the internet (exporter docs, GitHub issues, changelogs) to validate correctness and catch outdated series names. When you are not sure about a metric name, always search the internet to confirm it exists and is spelled correctly before using it.
|
||||
- Pay special attention to metric naming conventions: many exporters add `_total` suffixes for counters and `_seconds_total` for time-based counters. Verify the exact name from source code, not just docs. Known examples: Spark's PrometheusResource adds `_total` and `_seconds_total` suffixes (e.g., `metrics_executor_failedTasks_total`, not `metrics_executor_failedTasks`); Oracle's `oracledb_sessions_value` not `oracledb_sessions_activity`.
|
||||
- Verify that label names used in `{{ $labels.xxx }}` template variables actually exist on the metric. Check the exporter source code for the exact label names. Known examples: cloudflare/ebpf_exporter uses `id` not `name` for programs, and `config` not `name` for decoder errors.
|
||||
- When a metric uses info-style patterns (value always 1, information carried in labels), `== 0` will never be true — the metric simply won't exist. Use `absent()` instead. Known example: `ebpf_exporter_enabled_configs`.
|
||||
- Some metrics are version-dependent. When a metric was renamed or removed in a newer version, add a comment noting the version requirement. Known examples: `go_memstats_gc_cpu_fraction` removed in client_golang v1.12+; cert-manager renamed `certmanager_http_acme_client_request_count` to `certmanager_acme_client_request_count` in v1.19+.
|
||||
- Verify the unit of a metric before setting thresholds. Some metrics use milliseconds while descriptions assume seconds. Known example: Keycloak's `keycloak_request_duration` is in milliseconds, so `> 2` means 2ms not 2s.
|
||||
- Some exporters expose labels that differ between services even within the same ecosystem. Known example: OpenStack Neutron uses `adminState="up"` while Nova and Cinder use `adminState="enabled"`.
|
||||
- When an official mixin exists for a service, compare thresholds and time windows against it. Known deviations to watch for: Mimir store-gateway sync uses 1800s (not 600s), Mimir compactor skipped blocks uses `[24h]` (not `[5m]`), Tempo normalizes outstanding blocks per worker.
|
||||
|
||||
## Common Review Pitfalls (learned from PR history)
|
||||
|
||||
These are the most frequent issues raised during code review on this repo:
|
||||
|
||||
### Severity levels
|
||||
- `critical` = requires immediate human attention. Do not use for informational/security notifications.
|
||||
- `warning` = needs attention soon but not urgent.
|
||||
- `info` = awareness only (e.g., config changes, underutilized resources).
|
||||
- Authentication failures, security notifications, and config-change detections are typically `info`, not `critical`.
|
||||
|
||||
### `for` duration
|
||||
- Omit `for` when the default (0m) is intentional and appropriate — do not add `for: 0m` explicitly.
|
||||
- Add a `for` duration (e.g., `for: 2m` or `for: 5m`) to tolerate brief unavailability from restarts or transient spikes. Most "service down" rules should have at least `for: 1m`–`2m`.
|
||||
- Do not blanket-change all `for: 0m` to `for: 1m` — it depends on the alert's semantics and the range window used in `increase()`/`rate()`.
|
||||
|
||||
### Query design
|
||||
- Prefer symptom-based alerts over cause-based alerts to reduce alert fatigue. Example: "service is unreachable" is better than "specific internal counter changed". Metrics like heap object count, allocation rate, or free heap slots are causes, not symptoms — prefer GC duration, latency, or error rate alerts instead.
|
||||
- Don't add unnecessary aggregation (`avg()`, `avg_over_time()`) on metrics that are local to a single node/instance. Only aggregate when the alert is cluster-wide.
|
||||
- Don't combine `min_over_time()[1m]` with `for: 2m` redundantly — pick one mechanism for smoothing. Same applies to `avg_over_time()[5m]` with `for: 5m`.
|
||||
- Remove unnecessary label filters (e.g., `job="cassandra"` or `cluster=~".*"`) that add noise without value.
|
||||
- Verify comparison operators match the intent — e.g., "high snapshot count" must use `> N`, not `< N`.
|
||||
- When dividing counters (e.g., error rate = errors / total), guard against division by zero with `and total > 0` or filter appropriately. This is the most common issue in new PRs — check every ratio query.
|
||||
- Filter out system/template databases explicitly in DB queries (e.g., PostgreSQL: add `datid!="0"` alongside `datname!~"template.*|postgres"`).
|
||||
- Never use `rate()` on a gauge metric — use `deriv()` instead. `rate()` is for monotonically increasing counters only.
|
||||
- Conversely, never use `deriv()` or `delta()` on a metric that is a cumulative counter, even if the exporter declares it as `untyped`. The only reliable way to determine whether a metric is a counter or a gauge is to check whether it monotonically increases and resets on restart — not just the declared type. Known examples of untyped metrics with counter semantics: `node_vmstat_*` (e.g., `node_vmstat_pgmajfault`, `node_vmstat_oom_kill`) from node_exporter (cumulative values from /proc/vmstat — the official node_exporter mixin uses `rate()`); MySQL `SHOW GLOBAL STATUS` variables via mysqld_exporter (e.g., `mysql_global_status_slow_queries`, `mysql_global_status_innodb_log_waits`, `mysql_global_status_questions` — all monotonically increasing, use `rate()`/`increase()`).
|
||||
- When using `increase()` for ratio calculations, prefer `rate()` instead — `increase()` can produce incorrect results when counters reset mid-window.
|
||||
- When filtering gRPC error codes, don't use `grpc_code!="OK"` — this includes normal application responses like `NotFound`, `AlreadyExists`, and `Cancelled`. Filter to actual errors: `grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"`.
|
||||
- When computing ratios with `rate()` on a metric that is itself already a normalized rate (e.g., Oracle's `v$waitclassmetric`), applying `rate()` computes the rate-of-change of a rate, which is not meaningful.
|
||||
- When a multi-label metric is used in a binary operation with a metric that has fewer labels, use `ignoring(extra_label)` to avoid join failures. Known example: `systemd_unit_tasks_current / ignoring(type) systemd_unit_tasks_max`.
|
||||
- When a query groups by labels (e.g., `by (le, worker)`), consider the cardinality impact — hundreds of label values means hundreds of independent alerts.
|
||||
- Ensure `{{ $value | humanizeDuration }}` is only used on values in seconds. If the metric is in milliseconds, divide by 1000 first or use `{{ $value | humanize }}ms`.
|
||||
- Avoid using `up{job=~"exporter-name"} == 0` or `absent(up{job=~"exporter-name"})` to detect whether a service is down. When targets are managed via service discovery or a job reaches multiple targets, a disappeared target causes the `up` series to become stale and vanish rather than drop to 0, so the alert never fires. Prefer application-level or cluster-level metrics instead (e.g., "number of consul cluster members < 3", "PostgreSQL primary node absent").
|
||||
|
||||
### Thresholds
|
||||
- Alert thresholds are inherently arbitrary and depend on workload. Use `comments:` to note this when a threshold is a rough default.
|
||||
- When threshold values in a PR seem unreasonable (too high or too low), challenge them with real-world reasoning or exporter docs.
|
||||
- Watch for thresholds that are so high they only catch catastrophic scenarios and miss real problems. Examples: Go goroutine spike at 100/s (misses gradual leaks), Ruby major GC at 5/s (only fires if app is non-functional), Python gen2 GC at >1/s (extremely rare).
|
||||
- Watch for thresholds that will fire on normal healthy operation. Examples: Memcached at 90% memory is desired (it's a cache), Flink TaskManager at 90% JVM heap is normal, cache hit rate < 80% is common for cold caches.
|
||||
- For SNMP bandwidth utilization, `ifSpeed` (Gauge32) maxes at ~4.29 Gbps. For 10G+ interfaces, use `ifHighSpeed * 1000000` instead.
|
||||
- For alerts using `> 0` on counters with `rate()` or `increase()`, consider whether a single event truly warrants alerting. In most cases, a small threshold (e.g., `> 0.05` for rate, `> 3` for increase) better distinguishes real problems from transient noise.
|
||||
- When checking a cumulative total metric (one that only resets on process restart) with `> 0`, the alert will fire permanently after the first occurrence and never resolve. Always wrap such metrics in `increase()` or `rate()` to detect new events. Known example: `opensearch_circuitbreaker_tripped_count > 0` fires forever after the first circuit breaker trip.
|
||||
|
||||
### Comments
|
||||
- When an alert or its query needs explanation (e.g., non-obvious PromQL logic, threshold rationale, edge cases), use the rule-level `comments:` field. Use multiline comments when needed.
|
||||
- Use the exporter-level `comments:` field for notes that apply to all rules under that exporter (e.g., exporter version requirements, known quirks, setup prerequisites).
|
||||
- Comments are rendered as YAML `#` comments in the output, so they are visible to users who copy-paste the rules.
|
||||
- Never add two `comments:` keys to the same rule or exporter block. YAML silently discards the first when there are duplicate keys in the same mapping. Always merge multiple comment paragraphs into a single `comments:` field using the multiline `|` block scalar.
|
||||
|
||||
### Descriptions
|
||||
- Keep descriptions short, factual, and actionable.
|
||||
- Include what is happening ("Disk is almost full") and why it matters or what to check.
|
||||
- Use `{{ $labels.instance }}`, `{{ $value }}`, and other template variables in descriptions when useful.
|
||||
- If the description says "average" but the query uses `histogram_quantile(0.95, ...)`, fix the description to say "p95" (or vice versa).
|
||||
- When alerting on rates or ratios that may not be intuitive, include `{{ $value }}` in the description so operators can see the actual number.
|
||||
|
||||
### Structure
|
||||
- Some services have multiple exporters (e.g., MongoDB has `percona/mongodb_exporter` and `dcu/mongodb_exporter`). Place rules under the correct exporter.
|
||||
- Search for duplicates before adding a new rule — a similar alert may already exist under a different exporter or with different thresholds.
|
||||
- The `slug` field must be unique per exporter and is used for download URLs.
|
||||
|
||||
## Reference Sources for Cross-Checking Alerts
|
||||
|
||||
Use these sources to critique and validate PromQL queries, compare thresholds, and find inspiration for new rules.
|
||||
|
||||
Every time you use an external resource to change a PromQL query, compare the query before and after and explain why you think the external source is right.
|
||||
|
||||
### Official project mixins (alerts maintained by the project itself)
|
||||
- https://github.com/prometheus/node_exporter/tree/master/docs/node-mixin/alerts
|
||||
- https://github.com/prometheus/prometheus/tree/main/documentation/prometheus-mixin
|
||||
- https://github.com/prometheus/alertmanager/tree/main/doc/alertmanager-mixin
|
||||
- https://github.com/prometheus/snmp_exporter/tree/main/snmp-mixin
|
||||
- https://github.com/prometheus/mysqld_exporter/tree/main/mysqld-mixin
|
||||
- https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin
|
||||
- https://github.com/prometheus-community/elasticsearch_exporter (mixin via Grafana docs)
|
||||
- https://github.com/etcd-io/etcd/tree/main/contrib/mixin
|
||||
- https://github.com/thanos-io/thanos/tree/main/mixin (also: examples/alerts/)
|
||||
- https://github.com/grafana/loki/tree/main/production/loki-mixin (also: promtail-mixin/)
|
||||
- https://github.com/grafana/mimir/tree/main/operations/mimir-mixin
|
||||
- https://github.com/grafana/tempo/tree/main/operations/tempo-mixin
|
||||
- https://github.com/grafana/grafana/tree/main/grafana-mixin
|
||||
- https://github.com/ceph/ceph/tree/main/monitoring/ceph-mixin (in-tree; also https://github.com/ceph/ceph-mixins)
|
||||
- https://github.com/jaegertracing/jaeger/tree/main/monitoring/jaeger-mixin
|
||||
- https://github.com/kubernetes-monitoring/kubernetes-mixin (includes runbook.md)
|
||||
- https://github.com/kubernetes/kube-state-metrics/tree/main/jsonnet/kube-state-metrics-mixin
|
||||
- https://github.com/prometheus-operator/prometheus-operator/tree/main/jsonnet/mixin
|
||||
- https://github.com/prometheus-operator/kube-prometheus
|
||||
- https://github.com/cortexproject/cortex-jsonnet
|
||||
- https://github.com/gluster/gluster-mixins
|
||||
|
||||
### Standalone mixin repositories
|
||||
- https://github.com/povilasv/coredns-mixin
|
||||
- https://github.com/adinhodovic/rabbitmq-mixin
|
||||
- https://github.com/adinhodovic/blackbox-exporter-mixin
|
||||
- https://github.com/adinhodovic/django-mixin
|
||||
- https://github.com/adinhodovic/argo-cd-mixin
|
||||
- https://github.com/adinhodovic/ingress-nginx-mixin
|
||||
- https://github.com/adinhodovic/kubernetes-autoscaling-mixin
|
||||
- https://github.com/metalmatze/kube-cockroachdb (CockroachDB on Kubernetes)
|
||||
- https://github.com/bitnami-labs/sealed-secrets (sealed-secrets mixin)
|
||||
- https://github.com/lukas-vlcek/elasticsearch-mixin (includes runbook.md)
|
||||
- https://github.com/opensearch-project/opensearch-prometheus-exporter (OpenSearch exporter — check metric names here)
|
||||
- https://github.com/adinhodovic/postgresql-mixin
|
||||
- https://github.com/imusmanmalik/cert-manager-mixin
|
||||
- https://gitlab.com/uneeq-oss/cert-manager-mixin (alternative cert-manager mixin)
|
||||
- https://github.com/uneeq-oss/spinnaker-mixin
|
||||
- https://github.com/metalmatze/slo-libsonnet (SLO alerting/recording rules generation library)
|
||||
|
||||
### Grafana jsonnet-libs (93 mixins — browse for specific services)
|
||||
- https://github.com/grafana/jsonnet-libs
|
||||
- Notable mixins with alerts: consul, memcached, elasticsearch, haproxy, clickhouse, opensearch, redis, mongodb, kafka, nginx, rabbitmq, jvm, vault, envoy, istio, jenkins, caddy, cloudflare, docker, traefik, windows, snmp, argocd, nomad, pgbouncer, minio, ceph, and 60+ more.
|
||||
|
||||
### Mixin aggregators
|
||||
- https://monitoring.mixins.dev/ (central registry of all monitoring mixins)
|
||||
- https://github.com/monitoring-mixins/website/blob/master/mixins.json (machine-readable list of all mixins with source URLs)
|
||||
- https://github.com/nlamirault/monitoring-mixins (hub aggregating many mixins)
|
||||
|
||||
### GitLab monitoring & infrastructure
|
||||
- https://gitlab.com/gitlab-com/runbooks (GitLab.com SRE runbooks — production alert rules, runbook docs, alertmanager config)
|
||||
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules (production Mimir alerting rules organized by tenant/environment)
|
||||
- https://gitlab.com/gitlab-com/runbooks/-/tree/master/mimir-rules-jsonnet (jsonnet sources for GitLab alerting rules)
|
||||
- https://gitlab.com/gitlab-org/omnibus-gitlab/-/tree/master/files/gitlab-cookbooks/monitoring/templates/rules (default Prometheus rules shipped with GitLab Omnibus)
|
||||
|
||||
### Community alert collections
|
||||
- https://github.com/jpweber/prometheus-alert-rules
|
||||
- https://github.com/bdossantos/prometheus-alert-rules
|
||||
- https://github.com/giantswarm/prometheus-rules
|
||||
- https://github.com/last9/awesome-prometheus-toolkit
|
||||
- https://github.com/warpnet/awesome-prometheus (meta-list of Prometheus resources)
|
||||
|
|
@ -16,24 +16,16 @@ Please ensure your pull request adheres to the following guidelines:
|
|||
- Description must be factual (the "what?") and should provide root cause suggestions (the "why?"), for faster resolution.
|
||||
- Queries must be tested on latest exporter version.
|
||||
|
||||
## Improving the GitHub page
|
||||
## Improving the website
|
||||
|
||||
### Run locally
|
||||
The site is built with Astro + TypeScript, located in `site/`.
|
||||
|
||||
### Run locally
|
||||
|
||||
```
|
||||
gem install bundler
|
||||
bundle install
|
||||
jekyll serve
|
||||
cd site
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
Or with Docker:
|
||||
|
||||
```
|
||||
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
|
||||
```
|
||||
|
||||
Or with Docker-Compose:
|
||||
|
||||
```
|
||||
docker-compose up -d
|
||||
```
|
||||
Site serves at http://localhost:4321/awesome-prometheus-alerts.
|
||||
|
|
|
|||
3
Gemfile
3
Gemfile
|
|
@ -1,3 +0,0 @@
|
|||
source 'https://rubygems.org'
|
||||
gem 'github-pages', group: :jekyll_plugins
|
||||
gem 'webrick', '~> 1.3', '>= 1.3.1'
|
||||
284
Gemfile.lock
284
Gemfile.lock
|
|
@ -1,284 +0,0 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
activesupport (6.0.6.1)
|
||||
concurrent-ruby (~> 1.0, >= 1.0.2)
|
||||
i18n (>= 0.7, < 2)
|
||||
minitest (~> 5.1)
|
||||
tzinfo (~> 1.1)
|
||||
zeitwerk (~> 2.2, >= 2.2.2)
|
||||
addressable (2.8.0)
|
||||
public_suffix (>= 2.0.2, < 5.0)
|
||||
coffee-script (2.4.1)
|
||||
coffee-script-source
|
||||
execjs
|
||||
coffee-script-source (1.11.1)
|
||||
colorator (1.1.0)
|
||||
commonmarker (0.23.10)
|
||||
concurrent-ruby (1.2.0)
|
||||
dnsruby (1.61.9)
|
||||
simpleidn (~> 0.1)
|
||||
em-websocket (0.5.3)
|
||||
eventmachine (>= 0.12.9)
|
||||
http_parser.rb (~> 0)
|
||||
ethon (0.15.0)
|
||||
ffi (>= 1.15.0)
|
||||
eventmachine (1.2.7)
|
||||
execjs (2.8.1)
|
||||
faraday (1.10.0)
|
||||
faraday-em_http (~> 1.0)
|
||||
faraday-em_synchrony (~> 1.0)
|
||||
faraday-excon (~> 1.1)
|
||||
faraday-httpclient (~> 1.0)
|
||||
faraday-multipart (~> 1.0)
|
||||
faraday-net_http (~> 1.0)
|
||||
faraday-net_http_persistent (~> 1.0)
|
||||
faraday-patron (~> 1.0)
|
||||
faraday-rack (~> 1.0)
|
||||
faraday-retry (~> 1.0)
|
||||
ruby2_keywords (>= 0.0.4)
|
||||
faraday-em_http (1.0.0)
|
||||
faraday-em_synchrony (1.0.0)
|
||||
faraday-excon (1.1.0)
|
||||
faraday-httpclient (1.0.1)
|
||||
faraday-multipart (1.0.3)
|
||||
multipart-post (>= 1.2, < 3)
|
||||
faraday-net_http (1.0.1)
|
||||
faraday-net_http_persistent (1.2.0)
|
||||
faraday-patron (1.0.0)
|
||||
faraday-rack (1.0.0)
|
||||
faraday-retry (1.0.3)
|
||||
ffi (1.15.5)
|
||||
forwardable-extended (2.6.0)
|
||||
gemoji (3.0.1)
|
||||
github-pages (226)
|
||||
github-pages-health-check (= 1.17.9)
|
||||
jekyll (= 3.9.2)
|
||||
jekyll-avatar (= 0.7.0)
|
||||
jekyll-coffeescript (= 1.1.1)
|
||||
jekyll-commonmark-ghpages (= 0.2.0)
|
||||
jekyll-default-layout (= 0.1.4)
|
||||
jekyll-feed (= 0.15.1)
|
||||
jekyll-gist (= 1.5.0)
|
||||
jekyll-github-metadata (= 2.13.0)
|
||||
jekyll-include-cache (= 0.2.1)
|
||||
jekyll-mentions (= 1.6.0)
|
||||
jekyll-optional-front-matter (= 0.3.2)
|
||||
jekyll-paginate (= 1.1.0)
|
||||
jekyll-readme-index (= 0.3.0)
|
||||
jekyll-redirect-from (= 0.16.0)
|
||||
jekyll-relative-links (= 0.6.1)
|
||||
jekyll-remote-theme (= 0.4.3)
|
||||
jekyll-sass-converter (= 1.5.2)
|
||||
jekyll-seo-tag (= 2.8.0)
|
||||
jekyll-sitemap (= 1.4.0)
|
||||
jekyll-swiss (= 1.0.0)
|
||||
jekyll-theme-architect (= 0.2.0)
|
||||
jekyll-theme-cayman (= 0.2.0)
|
||||
jekyll-theme-dinky (= 0.2.0)
|
||||
jekyll-theme-hacker (= 0.2.0)
|
||||
jekyll-theme-leap-day (= 0.2.0)
|
||||
jekyll-theme-merlot (= 0.2.0)
|
||||
jekyll-theme-midnight (= 0.2.0)
|
||||
jekyll-theme-minimal (= 0.2.0)
|
||||
jekyll-theme-modernist (= 0.2.0)
|
||||
jekyll-theme-primer (= 0.6.0)
|
||||
jekyll-theme-slate (= 0.2.0)
|
||||
jekyll-theme-tactile (= 0.2.0)
|
||||
jekyll-theme-time-machine (= 0.2.0)
|
||||
jekyll-titles-from-headings (= 0.5.3)
|
||||
jemoji (= 0.12.0)
|
||||
kramdown (= 2.3.2)
|
||||
kramdown-parser-gfm (= 1.1.0)
|
||||
liquid (= 4.0.3)
|
||||
mercenary (~> 0.3)
|
||||
minima (= 2.5.1)
|
||||
nokogiri (>= 1.13.4, < 2.0)
|
||||
rouge (= 3.26.0)
|
||||
terminal-table (~> 1.4)
|
||||
github-pages-health-check (1.17.9)
|
||||
addressable (~> 2.3)
|
||||
dnsruby (~> 1.60)
|
||||
octokit (~> 4.0)
|
||||
public_suffix (>= 3.0, < 5.0)
|
||||
typhoeus (~> 1.3)
|
||||
html-pipeline (2.14.1)
|
||||
activesupport (>= 2)
|
||||
nokogiri (>= 1.4)
|
||||
http_parser.rb (0.8.0)
|
||||
i18n (0.9.5)
|
||||
concurrent-ruby (~> 1.0)
|
||||
jekyll (3.9.2)
|
||||
addressable (~> 2.4)
|
||||
colorator (~> 1.0)
|
||||
em-websocket (~> 0.5)
|
||||
i18n (~> 0.7)
|
||||
jekyll-sass-converter (~> 1.0)
|
||||
jekyll-watch (~> 2.0)
|
||||
kramdown (>= 1.17, < 3)
|
||||
liquid (~> 4.0)
|
||||
mercenary (~> 0.3.3)
|
||||
pathutil (~> 0.9)
|
||||
rouge (>= 1.7, < 4)
|
||||
safe_yaml (~> 1.0)
|
||||
jekyll-avatar (0.7.0)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
jekyll-coffeescript (1.1.1)
|
||||
coffee-script (~> 2.2)
|
||||
coffee-script-source (~> 1.11.1)
|
||||
jekyll-commonmark (1.4.0)
|
||||
commonmarker (~> 0.22)
|
||||
jekyll-commonmark-ghpages (0.2.0)
|
||||
commonmarker (~> 0.23.4)
|
||||
jekyll (~> 3.9.0)
|
||||
jekyll-commonmark (~> 1.4.0)
|
||||
rouge (>= 2.0, < 4.0)
|
||||
jekyll-default-layout (0.1.4)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-feed (0.15.1)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-gist (1.5.0)
|
||||
octokit (~> 4.2)
|
||||
jekyll-github-metadata (2.13.0)
|
||||
jekyll (>= 3.4, < 5.0)
|
||||
octokit (~> 4.0, != 4.4.0)
|
||||
jekyll-include-cache (0.2.1)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-mentions (1.6.0)
|
||||
html-pipeline (~> 2.3)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-optional-front-matter (0.3.2)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
jekyll-paginate (1.1.0)
|
||||
jekyll-readme-index (0.3.0)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
jekyll-redirect-from (0.16.0)
|
||||
jekyll (>= 3.3, < 5.0)
|
||||
jekyll-relative-links (0.6.1)
|
||||
jekyll (>= 3.3, < 5.0)
|
||||
jekyll-remote-theme (0.4.3)
|
||||
addressable (~> 2.0)
|
||||
jekyll (>= 3.5, < 5.0)
|
||||
jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
|
||||
rubyzip (>= 1.3.0, < 3.0)
|
||||
jekyll-sass-converter (1.5.2)
|
||||
sass (~> 3.4)
|
||||
jekyll-seo-tag (2.8.0)
|
||||
jekyll (>= 3.8, < 5.0)
|
||||
jekyll-sitemap (1.4.0)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-swiss (1.0.0)
|
||||
jekyll-theme-architect (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-cayman (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-dinky (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-hacker (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-leap-day (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-merlot (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-midnight (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-minimal (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-modernist (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-primer (0.6.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-github-metadata (~> 2.9)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-slate (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-tactile (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-time-machine (0.2.0)
|
||||
jekyll (> 3.5, < 5.0)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-titles-from-headings (0.5.3)
|
||||
jekyll (>= 3.3, < 5.0)
|
||||
jekyll-watch (2.2.1)
|
||||
listen (~> 3.0)
|
||||
jemoji (0.12.0)
|
||||
gemoji (~> 3.0)
|
||||
html-pipeline (~> 2.2)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
kramdown (2.3.2)
|
||||
rexml
|
||||
kramdown-parser-gfm (1.1.0)
|
||||
kramdown (~> 2.0)
|
||||
liquid (4.0.3)
|
||||
listen (3.7.1)
|
||||
rb-fsevent (~> 0.10, >= 0.10.3)
|
||||
rb-inotify (~> 0.9, >= 0.9.10)
|
||||
mercenary (0.3.6)
|
||||
minima (2.5.1)
|
||||
jekyll (>= 3.5, < 5.0)
|
||||
jekyll-feed (~> 0.9)
|
||||
jekyll-seo-tag (~> 2.1)
|
||||
minitest (5.17.0)
|
||||
multipart-post (2.1.1)
|
||||
nokogiri (1.16.2-x86_64-linux)
|
||||
racc (~> 1.4)
|
||||
octokit (4.22.0)
|
||||
faraday (>= 0.9)
|
||||
sawyer (~> 0.8.0, >= 0.5.3)
|
||||
pathutil (0.16.2)
|
||||
forwardable-extended (~> 2.6)
|
||||
public_suffix (4.0.7)
|
||||
racc (1.7.3)
|
||||
rb-fsevent (0.11.1)
|
||||
rb-inotify (0.10.1)
|
||||
ffi (~> 1.0)
|
||||
rexml (3.2.5)
|
||||
rouge (3.26.0)
|
||||
ruby2_keywords (0.0.5)
|
||||
rubyzip (2.3.2)
|
||||
safe_yaml (1.0.5)
|
||||
sass (3.7.4)
|
||||
sass-listen (~> 4.0.0)
|
||||
sass-listen (4.0.0)
|
||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||
rb-inotify (~> 0.9, >= 0.9.7)
|
||||
sawyer (0.8.2)
|
||||
addressable (>= 2.3.5)
|
||||
faraday (> 0.8, < 2.0)
|
||||
simpleidn (0.2.1)
|
||||
unf (~> 0.1.4)
|
||||
terminal-table (1.8.0)
|
||||
unicode-display_width (~> 1.1, >= 1.1.1)
|
||||
thread_safe (0.3.6)
|
||||
typhoeus (1.4.0)
|
||||
ethon (>= 0.9.0)
|
||||
tzinfo (1.2.11)
|
||||
thread_safe (~> 0.1)
|
||||
unf (0.1.4)
|
||||
unf_ext
|
||||
unf_ext (0.0.8.1)
|
||||
unicode-display_width (1.8.0)
|
||||
webrick (1.7.0)
|
||||
zeitwerk (2.6.6)
|
||||
|
||||
PLATFORMS
|
||||
x86_64-linux
|
||||
x86_64-linux-musl
|
||||
|
||||
DEPENDENCIES
|
||||
github-pages
|
||||
webrick (~> 1.3, >= 1.3.1)
|
||||
|
||||
BUNDLED WITH
|
||||
2.3.13
|
||||
38
LICENSE
38
LICENSE
|
|
@ -1,3 +1,39 @@
|
|||
This repository uses a dual license:
|
||||
|
||||
- Alert rules and content (_data/rules.yml, dist/rules/, README.md):
|
||||
Creative Commons Attribution 4.0 International (CC BY 4.0)
|
||||
https://creativecommons.org/licenses/by/4.0/
|
||||
|
||||
- Site source code (site/):
|
||||
MIT License
|
||||
https://opensource.org/licenses/MIT
|
||||
|
||||
---
|
||||
|
||||
Creative Commons Attribution 4.0 International License (CC BY 4.0)
|
||||
|
||||
http://creativecommons.org/licenses/by/4.0/
|
||||
https://creativecommons.org/licenses/by/4.0/
|
||||
|
||||
---
|
||||
|
||||
MIT License (site source code)
|
||||
|
||||
Copyright (c) 2018 Samuel Berthe
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
|
|
|||
117
README.md
117
README.md
|
|
@ -1,6 +1,6 @@
|
|||
# 👋 Awesome Prometheus Alerts [![Awesome](https://awesome.re/badge.svg)](https://awesome.re)
|
||||
|
||||
> Most alerting rules are common to every Prometheus setup. We need a place to find them all. 🤘 🚨 📊
|
||||
> **940+ production-ready Prometheus alerting rules for 90+ services** — copy-paste YAML for Kubernetes, MySQL, Redis, Kafka, and more.
|
||||
|
||||
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
|
||||
|
||||
|
|
@ -8,9 +8,18 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
|||
<hr>
|
||||
<sup><b>Sponsored by:</b></sup>
|
||||
<br>
|
||||
<a href="https://cast.ai/samuel">
|
||||
<div>
|
||||
<img src="https://samber.github.io/awesome-prometheus-alerts/images/sponsor-cast-ai.png" width="200" alt="Cast AI">
|
||||
</div>
|
||||
<div>
|
||||
Cut Kubernetes & AI costs, boost application stability.
|
||||
</div>
|
||||
</a>
|
||||
<br>
|
||||
<a href="https://betterstack.com">
|
||||
<div>
|
||||
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
|
||||
<img src="https://samber.github.io/awesome-prometheus-alerts/images/sponsor-betterstack.png" width="200" alt="Better Stack">
|
||||
</div>
|
||||
<div>
|
||||
Better Stack lets you centralize, search, and visualize your logs.
|
||||
|
|
@ -34,74 +43,130 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
|||
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
|
||||
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
|
||||
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
|
||||
- [IPMI](https://samber.github.io/awesome-prometheus-alerts/rules#ipmi)
|
||||
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
|
||||
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
|
||||
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
|
||||
- [VMWare](https://samber.github.io/awesome-prometheus-alerts/rules#vmware)
|
||||
- [Proxmox VE](https://samber.github.io/awesome-prometheus-alerts/rules#proxmox-ve)
|
||||
- [Netdata](https://samber.github.io/awesome-prometheus-alerts/rules#netdata)
|
||||
- [eBPF](https://samber.github.io/awesome-prometheus-alerts/rules#ebpf)
|
||||
- [Process Exporter](https://samber.github.io/awesome-prometheus-alerts/rules#process-exporter)
|
||||
- [Systemd](https://samber.github.io/awesome-prometheus-alerts/rules#systemd)
|
||||
|
||||
#### Databases and brokers
|
||||
#### Databases
|
||||
|
||||
- [MySQL](https://samber.github.io/awesome-prometheus-alerts/rules#mysql)
|
||||
- [PostgreSQL](https://samber.github.io/awesome-prometheus-alerts/rules#postgresql)
|
||||
- [SQL Server](https://samber.github.io/awesome-prometheus-alerts/rules#sql-server)
|
||||
- [Oracle Database](https://samber.github.io/awesome-prometheus-alerts/rules#oracle-database)
|
||||
- [Patroni](https://samber.github.io/awesome-prometheus-alerts/rules#patroni)
|
||||
- [PGBouncer](https://samber.github.io/awesome-prometheus-alerts/rules#pgbouncer)
|
||||
- [Redis](https://samber.github.io/awesome-prometheus-alerts/rules#redis)
|
||||
- [Memcached](https://samber.github.io/awesome-prometheus-alerts/rules#memcached)
|
||||
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
|
||||
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
|
||||
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
|
||||
- [OpenSearch](https://samber.github.io/awesome-prometheus-alerts/rules#opensearch)
|
||||
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
|
||||
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
|
||||
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
|
||||
- [CouchDB](https://samber.github.io/awesome-prometheus-alerts/rules#couchdb)
|
||||
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
|
||||
|
||||
#### Message brokers
|
||||
|
||||
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
|
||||
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
|
||||
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
|
||||
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
|
||||
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
|
||||
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
|
||||
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
|
||||
|
||||
#### Reverse proxies and load balancers
|
||||
#### Proxies, load balancers and service meshes
|
||||
|
||||
- [Nginx](https://samber.github.io/awesome-prometheus-alerts/rules#nginx)
|
||||
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
|
||||
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
|
||||
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
|
||||
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
|
||||
- [Envoy](https://samber.github.io/awesome-prometheus-alerts/rules#envoy)
|
||||
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
|
||||
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
|
||||
|
||||
#### Runtimes
|
||||
|
||||
- [PHP-FPM](https://samber.github.io/awesome-prometheus-alerts/rules#php-fpm)
|
||||
- [JVM](https://samber.github.io/awesome-prometheus-alerts/rules#jvm)
|
||||
- [Golang](https://samber.github.io/awesome-prometheus-alerts/rules#golang)
|
||||
- [Ruby](https://samber.github.io/awesome-prometheus-alerts/rules#ruby)
|
||||
- [Python](https://samber.github.io/awesome-prometheus-alerts/rules#python)
|
||||
- [Sidekiq](https://samber.github.io/awesome-prometheus-alerts/rules#sidekiq)
|
||||
|
||||
#### Data engineering
|
||||
|
||||
- [Apache Flink](https://samber.github.io/awesome-prometheus-alerts/rules#apache-flink)
|
||||
- [Apache Spark](https://samber.github.io/awesome-prometheus-alerts/rules#apache-spark)
|
||||
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
|
||||
|
||||
#### Orchestrators
|
||||
|
||||
- [Kubernetes](https://samber.github.io/awesome-prometheus-alerts/rules#kubernetes)
|
||||
- [Nomad](https://samber.github.io/awesome-prometheus-alerts/rules#nomad)
|
||||
- [Consul](https://samber.github.io/awesome-prometheus-alerts/rules#consul)
|
||||
- [Etcd](https://samber.github.io/awesome-prometheus-alerts/rules#etcd)
|
||||
- [Linkerd](https://samber.github.io/awesome-prometheus-alerts/rules#linkerd)
|
||||
- [Istio](https://samber.github.io/awesome-prometheus-alerts/rules#istio)
|
||||
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
|
||||
- [OpenStack](https://samber.github.io/awesome-prometheus-alerts/rules#openstack)
|
||||
|
||||
#### Network, security and storage
|
||||
#### CI/CD
|
||||
|
||||
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
||||
- [ArgoCD](https://samber.github.io/awesome-prometheus-alerts/rules#argocd)
|
||||
- [FluxCD](https://samber.github.io/awesome-prometheus-alerts/rules#fluxcd)
|
||||
- [GitLab CI](https://samber.github.io/awesome-prometheus-alerts/rules#gitlab-ci)
|
||||
- [Spinnaker](https://samber.github.io/awesome-prometheus-alerts/rules#spinnaker)
|
||||
|
||||
#### Network and security
|
||||
|
||||
- [SpeedTest](https://samber.github.io/awesome-prometheus-alerts/rules#speedtest)
|
||||
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
|
||||
- [cert-manager](https://samber.github.io/awesome-prometheus-alerts/rules#cert-manager)
|
||||
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
|
||||
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
|
||||
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
|
||||
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
|
||||
- [Keycloak](https://samber.github.io/awesome-prometheus-alerts/rules#keycloak)
|
||||
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
|
||||
- [SNMP](https://samber.github.io/awesome-prometheus-alerts/rules#snmp)
|
||||
- [Cilium](https://samber.github.io/awesome-prometheus-alerts/rules#cilium)
|
||||
- [WireGuard](https://samber.github.io/awesome-prometheus-alerts/rules#wireguard)
|
||||
|
||||
#### Storage
|
||||
|
||||
- [Ceph](https://samber.github.io/awesome-prometheus-alerts/rules#ceph)
|
||||
- [ZFS](https://samber.github.io/awesome-prometheus-alerts/rules#zfs)
|
||||
- [OpenEBS](https://samber.github.io/awesome-prometheus-alerts/rules#openebs)
|
||||
- [Minio](https://samber.github.io/awesome-prometheus-alerts/rules#minio)
|
||||
- [SSL/TLS](https://samber.github.io/awesome-prometheus-alerts/rules#ssl/tls)
|
||||
- [Juniper](https://samber.github.io/awesome-prometheus-alerts/rules#juniper)
|
||||
- [CoreDNS](https://samber.github.io/awesome-prometheus-alerts/rules#coredns)
|
||||
- [FreeSwitch](https://samber.github.io/awesome-prometheus-alerts/rules#freeswitch)
|
||||
- [Hashicorp Vault](https://samber.github.io/awesome-prometheus-alerts/rules#hashicorp-vault)
|
||||
- [Cloudflare](https://samber.github.io/awesome-prometheus-alerts/rules#cloudflare)
|
||||
|
||||
#### Other
|
||||
#### Cloud providers
|
||||
|
||||
- [AWS CloudWatch](https://samber.github.io/awesome-prometheus-alerts/rules#aws-cloudwatch)
|
||||
- [Google Cloud Stackdriver](https://samber.github.io/awesome-prometheus-alerts/rules#google-cloud-stackdriver)
|
||||
- [DigitalOcean](https://samber.github.io/awesome-prometheus-alerts/rules#digitalocean)
|
||||
- [Azure](https://samber.github.io/awesome-prometheus-alerts/rules#azure)
|
||||
|
||||
#### Observability
|
||||
|
||||
- [Thanos](https://samber.github.io/awesome-prometheus-alerts/rules#thanos)
|
||||
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
|
||||
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
|
||||
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
|
||||
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
||||
- [Grafana Tempo](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-tempo)
|
||||
- [Grafana Mimir](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-mimir)
|
||||
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
|
||||
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
|
||||
- [Jaeger](https://samber.github.io/awesome-prometheus-alerts/rules#jaeger)
|
||||
|
||||
#### Other
|
||||
|
||||
- [APC UPS](https://samber.github.io/awesome-prometheus-alerts/rules#apc-ups)
|
||||
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
|
||||
|
||||
## 🤝 Contributing
|
||||
|
|
@ -112,23 +177,15 @@ There are many ways to contribute: writing code, alerting rules, documentation,
|
|||
|
||||
[Instructions here](CONTRIBUTING.md)
|
||||
|
||||
## 🏋️ Improvements
|
||||
|
||||
- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...)
|
||||
- Add resolution suggestions to rule descriptions, for faster incident resolution ([#85](https://github.com/samber/awesome-prometheus-alerts/issues/85)).
|
||||
|
||||
## 💫 Show your support
|
||||
|
||||
Give a ⭐️ if this project helped you!
|
||||
|
||||
[Support this project on Patreon](https://www.patreon.com/samber)
|
||||
|
||||
## 👏 Thanks
|
||||
|
||||
Thanks to the GitLab operations team, who provided 50+ rules. \o/
|
||||
|
||||
## 📝 License
|
||||
|
||||
[Creative Commons Attribution 4.0 International (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/legalcode)
|
||||
- Alert rules and content: [Creative Commons CC BY 4.0](https://creativecommons.org/licenses/by/4.0/)
|
||||
- Site source code: [MIT](site/LICENSE)
|
||||
|
||||
Licensed under the Creative Commons Attribution 4.0 International License; see the LICENSE file for more details.
|
||||
See [LICENSE](LICENSE) for details.
|
||||
|
|
|
|||
|
|
@ -1,8 +0,0 @@
|
|||
theme: jekyll-theme-cayman
|
||||
|
||||
title: Awesome Prometheus alerts
|
||||
description: Collection of alerting rules
|
||||
|
||||
repository: samber/awesome-prometheus-alerts
|
||||
|
||||
baseurl: /awesome-prometheus-alerts
|
||||
5039
_data/rules.yml
5039
_data/rules.yml
File diff suppressed because it is too large
Load diff
|
|
@ -1,170 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="{{ site.lang | default: "en-US" }}">
|
||||
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
{% seo %}
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta name="theme-color" content="#157878">
|
||||
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.7.0/css/font-awesome.min.css">
|
||||
<link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
|
||||
<link rel="stylesheet" href="{{ '/assets/css/app.css?v=' | append: site.github.build_revision | relative_url }}">
|
||||
<link rel="icon" type="image/png" href="/assets/favicon.ico">
|
||||
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
|
||||
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
|
||||
<script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/2.0.4/clipboard.min.js"></script>
|
||||
<script src="{{ '/assets/js/app.js?v=' | append: site.github.build_revision | relative_url }}"></script>
|
||||
|
||||
<!-- Global site tag (gtag.js) - Google Analytics -->
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id=UA-118604063-2"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
|
||||
function gtag() {
|
||||
dataLayer.push(arguments);
|
||||
}
|
||||
gtag('js', new Date());
|
||||
|
||||
gtag('config', 'UA-118604063-2');
|
||||
</script>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<style>
|
||||
#skip-to-content {
|
||||
height: 1px;
|
||||
width: 1px;
|
||||
position: absolute;
|
||||
overflow: hidden;
|
||||
top: -10px;
|
||||
|
||||
&:focus {
|
||||
position: fixed;
|
||||
top: 10px;
|
||||
left: 10px;
|
||||
height: auto;
|
||||
width: auto;
|
||||
background: invert($body-link-color);
|
||||
outline: thick solid invert($body-link-color);
|
||||
}
|
||||
}
|
||||
|
||||
ul.github-buttons-cta li {
|
||||
display: inline-block;
|
||||
height: 20px;
|
||||
padding: 0px 15px;
|
||||
}
|
||||
|
||||
ul.github-buttons-cta li a {
|
||||
/* width: 100px; */
|
||||
text-decoration: none;
|
||||
}
|
||||
|
||||
.fa {
|
||||
/* padding: 14px;
|
||||
width: 50px;
|
||||
height: 50px; */
|
||||
font-size: 25px;
|
||||
text-align: center;
|
||||
text-decoration: none;
|
||||
border-radius: 50%;
|
||||
}
|
||||
|
||||
.fa:hover {
|
||||
opacity: 0.8;
|
||||
}
|
||||
|
||||
.fa-twitter,
|
||||
.fa-linkedin {
|
||||
/* background: #55ACEE; */
|
||||
color: white;
|
||||
}
|
||||
</style>
|
||||
<a id="skip-to-content" href="#content">Skip to the content.</a>
|
||||
|
||||
<header class="page-header" role="banner">
|
||||
<h1 class="project-name">
|
||||
<a href="{{ '/' | relative_url }}" style="color: white">
|
||||
{{ site.title | default: site.github.repository_name }}
|
||||
</a>
|
||||
</h1>
|
||||
<h2 class="project-tagline">{{ site.description | default: site.github.project_tagline }}</h2>
|
||||
<a href="{{ '/alertmanager' | relative_url }}" class="btn">Global configuration</a>
|
||||
<a href="{{ '/rules' | relative_url }}" class="btn">Rules</a>
|
||||
<a href="{{ '/sleep-peacefully' | relative_url }}" class="btn">Sleep peacefully</a>
|
||||
<a href="{{ '/blackbox-exporter' | relative_url }}" class="btn">Blackbox</a>
|
||||
<a href="https://github.com/samber/awesome-prometheus-alerts/blob/master/CONTRIBUTING.md" class="btn">
|
||||
Contribute on GitHub
|
||||
</a>
|
||||
|
||||
<ul class="github-buttons-cta">
|
||||
<li>
|
||||
<a href="https://github.com/samber/awesome-prometheus-alerts">
|
||||
<img alt="GitHub Repo Watchers" src="https://img.shields.io/github/watchers/samber/awesome-prometheus-alerts?style=social">
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://github.com/samber/awesome-prometheus-alerts">
|
||||
<img alt="GitHub Repo stars" src="https://img.shields.io/github/stars/samber/awesome-prometheus-alerts?style=social">
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://github.com/samber/awesome-prometheus-alerts">
|
||||
<img alt="GitHub Repo forks" src="https://img.shields.io/github/forks/samber/awesome-prometheus-alerts?style=social">
|
||||
</a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://twitter.com/share?via=samuelberthe&related=samuelberthe&text=🚨 📊 Here is a collection of Awesome Prometheus Alerts&url=https://samber.github.io/awesome-prometheus-alerts"
|
||||
class="fa fa-twitter" target="_blank"></a>
|
||||
</li>
|
||||
<li>
|
||||
<a href="http://www.linkedin.com/shareArticle?mini=true&url=https://samber.github.io/awesome-prometheus-alerts/"
|
||||
class="fa fa-linkedin" target="_blank"></a>
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
|
||||
<ul id="sponsoring">
|
||||
<li>
|
||||
Kindly supported by 👉
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://betterstack.com/">
|
||||
<img width="" src="assets/sponsor-betterstack.png" />
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</header>
|
||||
|
||||
<main id="content" class="main-content" role="main">
|
||||
{{ content }}
|
||||
|
||||
<footer class="site-footer">
|
||||
{% if site.github.is_project_page %}
|
||||
<span class="site-footer-owner">
|
||||
<a href="{{ site.github.repository_url }}">{{ site.title }}</a> is maintained by
|
||||
<a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a>.
|
||||
</span>
|
||||
{% endif %}
|
||||
</footer>
|
||||
</main>
|
||||
|
||||
|
||||
<!-- Screeb tag -->
|
||||
<script type="text/javascript">
|
||||
(function (s,c,r,ee,b) {
|
||||
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
|
||||
b=c.createElement('script');b.type='text/javascript';
|
||||
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
|
||||
}(window,document,'$screeb','https://t2.screeb.app/tag.js'));
|
||||
|
||||
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
|
||||
</script>
|
||||
<!-- End of Screeb tag -->
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
||||
141
alertmanager.md
141
alertmanager.md
|
|
@ -1,141 +0,0 @@
|
|||
<h1 style="text-align: center;">
|
||||
Global configuration
|
||||
</h1>
|
||||
|
||||
If you notice a delay between an event and the first notification, read the following blog post => [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
|
||||
|
||||
## Prometheus configuration
|
||||
|
||||
{% highlight yaml %}
|
||||
# prometheus.yml
|
||||
|
||||
global:
|
||||
scrape_interval: 20s
|
||||
|
||||
# A short evaluation_interval will check alerting rules very often.
|
||||
# It can be costly if you run Prometheus with 100+ alerts.
|
||||
evaluation_interval: 20s
|
||||
...
|
||||
|
||||
rule_files:
|
||||
- 'alerts/*.yml'
|
||||
|
||||
scrape_configs:
|
||||
...
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
{% highlight yaml %}
|
||||
# alerts/example-redis.yml
|
||||
|
||||
groups:
|
||||
|
||||
- name: ExampleRedisGroup
|
||||
rules:
|
||||
- alert: ExampleRedisDown
|
||||
expr: redis_up{} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis instance down"
|
||||
description: "Whatever"
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
## AlertManager configuration
|
||||
|
||||
{% highlight yaml %}
|
||||
{% raw %}
|
||||
# alertmanager.yml
|
||||
|
||||
route:
|
||||
# When a new group of alerts is created by an incoming alert, wait at
|
||||
# least 'group_wait' to send the initial notification.
|
||||
# This way ensures that you get multiple alerts for the same group that start
|
||||
# firing shortly after another are batched together on the first
|
||||
# notification.
|
||||
group_wait: 10s
|
||||
|
||||
# When the first notification was sent, wait 'group_interval' to send a batch
|
||||
# of new alerts that started firing for that group.
|
||||
group_interval: 30s
|
||||
|
||||
# If an alert has successfully been sent, wait 'repeat_interval' to
|
||||
# resend them.
|
||||
repeat_interval: 30m
|
||||
|
||||
# A default receiver
|
||||
receiver: "slack"
|
||||
|
||||
# All the above attributes are inherited by all child routes and can
|
||||
# be overwritten on each.
|
||||
routes:
|
||||
- receiver: "slack"
|
||||
group_wait: 10s
|
||||
match_re:
|
||||
severity: critical|warning
|
||||
continue: true
|
||||
|
||||
- receiver: "pager"
|
||||
group_wait: 10s
|
||||
match_re:
|
||||
severity: critical
|
||||
continue: true
|
||||
|
||||
receivers:
|
||||
- name: "slack"
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx'
|
||||
send_resolved: true
|
||||
channel: 'monitoring'
|
||||
text: "{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}"
|
||||
|
||||
- name: "pager"
|
||||
webhook_configs:
|
||||
- url: http://a.b.c.d:8080/send/sms
|
||||
send_resolved: true
|
||||
|
||||
{% endraw %}
|
||||
{% endhighlight %}
|
||||
|
||||
## Reduce Prometheus server load
|
||||
|
||||
For expensive or frequent PromQL queries, Prometheus allows you to precompute results with recording rules.
|
||||
|
||||
{% highlight yaml %}
|
||||
{% raw %}
|
||||
groups:
|
||||
|
||||
# first define the recording rule
|
||||
- name: ExampleRecordedGroup
|
||||
rules:
|
||||
- record: job:rabbitmq_queue_messages_delivered_total:rate:5m
|
||||
expr: rate(rabbitmq_queue_messages_delivered_total[5m])
|
||||
|
||||
# then use it in alerts
|
||||
- name: ExampleAlertingGroup
|
||||
rules:
|
||||
- alert: ExampleRabbitmqLowMessageDelivery
|
||||
expr: sum(job:rabbitmq_queue_messages_delivered_total:rate:5m) < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Low delivery rate in Rabbitmq queues"
|
||||
{% endraw %}
|
||||
{% endhighlight %}
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If the notification takes too much time to be triggered, check the following delays:
|
||||
- `scrape_interval = 20s` (prometheus.yml)
|
||||
- `evaluation_interval = 20s` (prometheus.yml)
|
||||
- `increase(mysql_global_status_slow_queries[1m]) > 0` (alerts/example-mysql.yml)
|
||||
- `for: 5m` (alerts/example-mysql.yml)
|
||||
- `group_wait = 10s` (alertmanager.yml)
|
||||
|
||||
Also read:
|
||||
- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
|
||||
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
|
||||
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
a.anchor {
|
||||
font-size: 15px;
|
||||
vertical-align: middle;
|
||||
color: darkblue;
|
||||
display: inline-block;
|
||||
padding-bottom: 5px;
|
||||
margin-right: 5px;
|
||||
opacity: 0;
|
||||
transition: opacity 0.4s;
|
||||
}
|
||||
|
||||
h2:hover a.anchor,
|
||||
h3:hover a.anchor,
|
||||
h4:hover a.anchor {
|
||||
opacity: 1;
|
||||
}
|
||||
|
||||
summary {
|
||||
position: relative;
|
||||
padding-left: 60px;
|
||||
padding-right: 50px;
|
||||
margin-bottom: 15px;
|
||||
font-size: 15px;
|
||||
}
|
||||
|
||||
h2 {
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.clipboard-single,
|
||||
.clipboard-multiple {
|
||||
right: 0;
|
||||
position: absolute;
|
||||
cursor: pointer;
|
||||
font-size: 14px;
|
||||
color: #606c71;
|
||||
}
|
||||
|
||||
/* NAVBAR */
|
||||
#rules-navbar.affix {
|
||||
/* shown by JS */
|
||||
display: none;
|
||||
|
||||
position: fixed;
|
||||
overflow: auto;
|
||||
top: 0;
|
||||
right: 0;
|
||||
max-width: 250px;
|
||||
max-height: 100%;
|
||||
padding-top: 20px;
|
||||
padding-bottom: 20px;
|
||||
padding-left: 20px;
|
||||
padding-right: 10px;
|
||||
|
||||
background-color: #f3f6fa;
|
||||
}
|
||||
|
||||
/* hide menu on small screens */
|
||||
@media screen and (max-width: 1350px) {
|
||||
#rules-navbar.affix {
|
||||
display: none !important;
|
||||
}
|
||||
}
|
||||
|
||||
/* hide menu scrollbar */
|
||||
#rules-navbar.affix::-webkit-scrollbar {
|
||||
display: none;
|
||||
}
|
||||
|
||||
#rules-navbar.affix {
|
||||
-ms-overflow-style: none;
|
||||
/* IE and Edge */
|
||||
scrollbar-width: none;
|
||||
/* Firefox */
|
||||
}
|
||||
|
||||
#rules-navbar.affix h3 {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
#rules-navbar.affix h4 {
|
||||
margin: 0;
|
||||
font-weight: bold;
|
||||
font-size: 14px;
|
||||
line-height: 14px;
|
||||
}
|
||||
|
||||
#rules-navbar.affix ul,
|
||||
#rules-navbar.affix ul li {
|
||||
margin: 0;
|
||||
padding-top: 0;
|
||||
padding-bottom: 0;
|
||||
line-height: normal;
|
||||
}
|
||||
|
||||
#rules-navbar.affix>ul {
|
||||
padding-left: 0;
|
||||
padding-right: 0;
|
||||
}
|
||||
|
||||
#rules-navbar.affix>ul>li {
|
||||
margin-bottom: 10px;
|
||||
padding-left: 0;
|
||||
padding-right: 0;
|
||||
}
|
||||
|
||||
#rules-navbar.affix a {
|
||||
font-size: 14px;
|
||||
line-height: 14px;
|
||||
}
|
||||
|
||||
/* https://github.com/samber/awesome-prometheus-alerts/issues/356 */
|
||||
@media screen and (min-width: 64em) {
|
||||
.main-content {
|
||||
max-width: 85rem;
|
||||
}
|
||||
}
|
||||
|
||||
ul#sponsoring {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
margin-top: 50px;
|
||||
}
|
||||
|
||||
ul#sponsoring li {
|
||||
display: flex;
|
||||
padding: 0px 15px;
|
||||
font-size: 16px;
|
||||
}
|
||||
|
||||
ul#sponsoring li a {
|
||||
display: flex;
|
||||
}
|
||||
|
||||
ul#sponsoring li a img {
|
||||
max-width: 180px;
|
||||
max-height: 80px;
|
||||
}
|
||||
|
||||
.page-header {
|
||||
padding-bottom: 30px;
|
||||
}
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 4.3 KiB |
|
|
@ -1,16 +0,0 @@
|
|||
$(function () {
|
||||
var clipboardRules = new ClipboardJS('.clipboard-single', {
|
||||
text: function (trigger) {
|
||||
const id = trigger.getAttribute('data-clipboard-target-id');
|
||||
const html = $("#" + id + " .highlight");
|
||||
return html.text() + '\n';
|
||||
},
|
||||
});
|
||||
var clipboardCategories = new ClipboardJS('.clipboard-multiple', {
|
||||
text: function (trigger) {
|
||||
const id = trigger.getAttribute('data-clipboard-target-id');
|
||||
const html = $("[id^=" + id + "] .highlight");
|
||||
return Array.from(html.map((i, target) => $(target).text())).join('\n\n');
|
||||
},
|
||||
});
|
||||
});
|
||||
|
|
@ -1,125 +0,0 @@
|
|||
|
||||
<h1 style="text-align: center;">
|
||||
Blackbox exporter
|
||||
</h1>
|
||||
|
||||
## Worldwide probes
|
||||
|
||||
<a href="https://github.com/prometheus/blackbox_exporter" target="_blank">Blackbox Exporter</a> gives you the ability to probe endpoints over HTTP, HTTPS, DNS, TCP and ICMP.
|
||||
|
||||
You should deploy blackbox exporters in multiple Points of Presence around the globe to monitor latency. Feel free to use the following endpoints for your own projects:
|
||||
|
||||
- https://screeb-probe-<b>montreal</b>.cleverapps.io
|
||||
- https://screeb-probe-<b>paris</b>.cleverapps.io
|
||||
- https://screeb-probe-<b>jeddah</b>.cleverapps.io
|
||||
- https://screeb-probe-<b>singapore</b>.cleverapps.io
|
||||
- https://screeb-probe-<b>sydney</b>.cleverapps.io
|
||||
- https://screeb-probe-<b>warsaw</b>.cleverapps.io
|
||||
|
||||
☝️ Logs have been disabled. More probes from the community would be appreciated, please contribute <a href="https://github.com/samber/awesome-prometheus-alerts/" target="_blank">here</a>! These blackbox exporters use the following <a href="https://github.com/ScreebApp/blackbox_exporter/blob/master/screeb.yml" target="_blank">configuration</a>.
|
||||
|
||||
## Prometheus Configuration
|
||||
|
||||
Blackbox exporters and endpoints must be declared in Prometheus. Here is a simple configuration, inspired by [Hayk Davtyan's Medium post](https://medium.com/geekculture/single-prometheus-job-for-dozens-of-blackbox-exporters-2a7ba492d6c8):
|
||||
|
||||
```yml
|
||||
# sd/blackbox.yml
|
||||
|
||||
- targets:
|
||||
#
|
||||
# Montreal
|
||||
#
|
||||
# http
|
||||
- screeb-probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://api.screeb.app
|
||||
- screeb-probe-montreal.cleverapps.io:_:http_2xx:_:Montreal:_:f229cy:_:https://t.screeb.app/tag.js
|
||||
# icmp
|
||||
- screeb-probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:api.screeb.app
|
||||
- screeb-probe-montreal.cleverapps.io:_:icmp_ipv4:_:Montreal:_:f229cy:_:t.screeb.app
|
||||
|
||||
|
||||
#
|
||||
# Paris
|
||||
#
|
||||
# http
|
||||
- screeb-probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://api.screeb.app
|
||||
- screeb-probe-paris.cleverapps.io:_:http_2xx:_:Paris:_:u09tgy:_:https://t.screeb.app/tag.js
|
||||
# icmp
|
||||
- screeb-probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:api.screeb.app
|
||||
- screeb-probe-paris.cleverapps.io:_:icmp_ipv4:_:Paris:_:u09tgy:_:t.screeb.app
|
||||
|
||||
|
||||
#
|
||||
# Sydney
|
||||
#
|
||||
# http
|
||||
- screeb-probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://api.screeb.app
|
||||
- screeb-probe-sydney.cleverapps.io:_:http_2xx:_:Sydney:_:r3gpkn:_:https://t.screeb.app/tag.js
|
||||
# icmp
|
||||
- screeb-probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:api.screeb.app
|
||||
- screeb-probe-sydney.cleverapps.io:_:icmp_ipv4:_:Sydney:_:r3gpkn:_:t.screeb.app
|
||||
|
||||
# ...
|
||||
```
|
||||
|
||||
```yml
|
||||
# prometheus.yml
|
||||
|
||||
global:
|
||||
# ...
|
||||
|
||||
scrape_configs:
|
||||
|
||||
- job_name: 'blackbox'
|
||||
metrics_path: /probe
|
||||
scrape_interval: 30s
|
||||
scheme: https
|
||||
file_sd_configs:
|
||||
- files:
|
||||
- /etc/prometheus/sd/blackbox.yml
|
||||
relabel_configs:
|
||||
# adds "module" label in the final labelset
|
||||
- source_labels: [__address__]
|
||||
regex: '.*:_:(.*):_:.*:_:.*:_:.*'
|
||||
target_label: module
|
||||
# adds "geohash" label in the final labelset
|
||||
- source_labels: [__address__]
|
||||
regex: '.*:_:.*:_:.*:_:(.*):_:.*'
|
||||
target_label: geohash
|
||||
# rewrites "instance" label with corresponding URL
|
||||
- source_labels: [__address__]
|
||||
regex: '.*:_:.*:_:.*:_:.*:_:(.*)'
|
||||
target_label: instance
|
||||
# rewrites "pop" label with corresponding location name
|
||||
- source_labels: [__address__]
|
||||
regex: '.*:_:.*:_:(.*):_:.*:_:.*'
|
||||
target_label: pop
|
||||
# passes "module" parameter to Blackbox exporter
|
||||
- source_labels: [module]
|
||||
target_label: __param_module
|
||||
# passes "target" parameter to Blackbox exporter
|
||||
- source_labels: [instance]
|
||||
target_label: __param_target
|
||||
# the Blackbox exporter's real hostname:port
|
||||
- source_labels: [__address__]
|
||||
regex: '(.*):_:.*:_:.*:_:.*:_:.*'
|
||||
target_label: __address__
|
||||
|
||||
# ...
|
||||
|
||||
```
|
||||
|
||||
## Geohash
|
||||
|
||||

|
||||
|
||||
To display nice maps in Grafana, you need to instruct blackbox exporters about the location. Grafana map panel speaks the "geohash" format:
|
||||
|
||||
- go to Google Maps
|
||||
- extract the lat/long from the URL
|
||||
- convert lat/long to geohash here: http://geohash.co
|
||||
|
||||
## Grafana
|
||||
|
||||
Some great dashboards have been created by the community: https://grafana.com/grafana/dashboards/?search=blackbox
|
||||
|
||||
Since Grafana v5.0.0, a map panel is available: https://grafana.com/docs/grafana/latest/panels-visualizations/visualizations/geomap/
|
||||
123
dist/rules/apache-flink/flink-prometheus-reporter.yml
vendored
Normal file
123
dist/rules/apache-flink/flink-prometheus-reporter.yml
vendored
Normal file
|
|
@ -0,0 +1,123 @@
|
|||
groups:
|
||||
|
||||
- name: FlinkPrometheusReporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: FlinkJobIsNotRunning
|
||||
expr: 'flink_jobmanager_numRunningJobs == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Flink job is not running (instance {{ $labels.instance }})
|
||||
description: "No Flink jobs are currently running. All jobs may have failed or been cancelled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FlinkNoTaskmanagersRegistered
|
||||
expr: 'flink_jobmanager_numRegisteredTaskManagers == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Flink no TaskManagers registered (instance {{ $labels.instance }})
|
||||
description: "No TaskManagers are registered with the JobManager. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This alert fires when there are no available task slots. Adjust the threshold if your cluster is expected to run at full capacity.
|
||||
- alert: FlinkAllTaskSlotsUsed
|
||||
expr: 'flink_jobmanager_taskSlotsAvailable == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink all task slots used (instance {{ $labels.instance }})
|
||||
description: "All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A single restart may be normal during deployments. Adjust threshold based on restart tolerance.
|
||||
- alert: FlinkJobRestartIncreasing
|
||||
expr: 'delta(flink_jobmanager_job_numRestarts[5m]) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink job restart increasing (instance {{ $labels.instance }})
|
||||
description: "Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FlinkCheckpointFailures
|
||||
expr: 'delta(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink checkpoint failures (instance {{ $labels.instance }})
|
||||
description: "Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Value is converted from milliseconds to seconds for correct humanizeDuration display.
|
||||
# Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
|
||||
- alert: FlinkCheckpointDurationHigh
|
||||
expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink checkpoint duration high (instance {{ $labels.instance }})
|
||||
description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FlinkTaskBackpressured
|
||||
expr: 'flink_taskmanager_job_task_isBackPressured == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink task backpressured (instance {{ $labels.instance }})
|
||||
description: "Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when a task spends more than 500ms/sec backpressured. This indicates the task cannot keep up with upstream data rate.
|
||||
- alert: FlinkTaskHighBackpressureTime
|
||||
expr: 'flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink task high backpressure time (instance {{ $labels.instance }})
|
||||
description: "Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Flink TaskManagers manage their own memory pool. High JVM heap usage (outside managed memory) may indicate memory leaks or misconfiguration.
|
||||
- alert: FlinkTaskmanagerHeapMemoryHigh
|
||||
expr: 'flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_taskmanager_Status_JVM_Memory_Heap_Max > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink TaskManager heap memory high (instance {{ $labels.instance }})
|
||||
description: "Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FlinkJobmanagerHeapMemoryHigh
|
||||
expr: 'flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9 and flink_jobmanager_Status_JVM_Memory_Heap_Max > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink JobManager heap memory high (instance {{ $labels.instance }})
|
||||
description: "Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Flink exposes GC time as a gauge (cumulative milliseconds), so deriv() is used instead of rate().
|
||||
# Threshold: more than 100ms/sec of GC time (10% of wall clock). Adjust based on your workload.
|
||||
- alert: FlinkTaskmanagerGcTimeHigh
|
||||
expr: 'deriv(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink TaskManager GC time high (instance {{ $labels.instance }})
|
||||
description: "Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only fires for tasks that have previously received records, to avoid false positives during startup.
|
||||
- alert: FlinkNoRecordsProcessed
|
||||
expr: 'delta(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flink no records processed (instance {{ $labels.instance }})
|
||||
description: "Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
89
dist/rules/apache-spark/spark-prometheus.yml
vendored
Normal file
89
dist/rules/apache-spark/spark-prometheus.yml
vendored
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
groups:
|
||||
|
||||
- name: SparkPrometheus
|
||||
|
||||
# Spark exposes metrics via two built-in endpoints:
|
||||
# - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040)
|
||||
# - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x)
|
||||
# Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging.
|
||||
# Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SparkNoAliveWorkers
|
||||
expr: 'metrics_master_aliveWorkers_Value == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Spark no alive workers (instance {{ $labels.instance }})
|
||||
description: "No Spark workers are alive. The cluster has no processing capacity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Adjust the threshold based on your cluster's typical queuing behavior.
|
||||
- alert: SparkTooManyWaitingApps
|
||||
expr: 'metrics_master_waitingApps_Value > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark too many waiting apps (instance {{ $labels.instance }})
|
||||
description: "Spark has {{ $value }} applications waiting for resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SparkWorkerMemoryExhausted
|
||||
expr: 'metrics_worker_memFree_MB_Value == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark worker memory exhausted (instance {{ $labels.instance }})
|
||||
description: "Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when a worker has no free cores. This may be normal under high load but can indicate capacity issues.
|
||||
- alert: SparkWorkerCoresExhausted
|
||||
expr: 'metrics_worker_coresFree_Value == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark worker cores exhausted (instance {{ $labels.instance }})
|
||||
description: "Spark worker {{ $labels.instance }} has no free cores.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when more than 10% of executor time is spent in garbage collection.
|
||||
# This metric comes from the PrometheusResource endpoint (/metrics/executors/prometheus/).
|
||||
- alert: SparkExecutorHighGcTime
|
||||
expr: 'metrics_executor_totalGCTime_seconds_total / metrics_executor_totalDuration > 0.1 and metrics_executor_totalDuration > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark executor high GC time (instance {{ $labels.instance }})
|
||||
description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SparkExecutorAllTasksFailing
|
||||
expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Spark executor all tasks failing (instance {{ $labels.instance }})
|
||||
description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: SparkExecutorHighTaskFailureRate
|
||||
expr: 'metrics_executor_failedTasks_total / metrics_executor_totalTasks_total > 0.1 and metrics_executor_totalTasks_total > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark executor high task failure rate (instance {{ $labels.instance }})
|
||||
description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# diskUsed is a gauge, not a counter — do not use rate(). Threshold of 1GB is a rough default.
|
||||
# Disk spilling indicates insufficient memory for the workload.
|
||||
- alert: SparkExecutorHighDiskSpill
|
||||
expr: 'metrics_executor_diskUsed_bytes > 1e9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Spark executor high disk spill (instance {{ $labels.instance }})
|
||||
description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: LusitaniaeApacheExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ApacheDown
|
||||
|
|
@ -14,7 +15,7 @@ groups:
|
|||
description: "Apache down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ApacheWorkersLoad
|
||||
expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80'
|
||||
expr: '(sum by (instance) (apache_workers{state="busy"}) / sum by (instance) (apache_scoreboard) ) * 100 > 80 and sum by (instance) (apache_scoreboard) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -26,7 +27,7 @@ groups:
|
|||
expr: 'apache_uptime_seconds_total / 60 < 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Apache restart (instance {{ $labels.instance }})
|
||||
description: "Apache has just been restarted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
3
dist/rules/apc-ups/apcupsd_exporter.yml
vendored
3
dist/rules/apc-ups/apcupsd_exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: Apcupsd_exporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ApcUpsBatteryNearlyEmpty
|
||||
|
|
@ -32,7 +33,7 @@ groups:
|
|||
description: "UPS now running on battery (since {{$value | humanizeDuration}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ApcUpsLowBatteryVoltage
|
||||
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
|
||||
expr: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95 and apcupsd_battery_nominal_volts > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
1
dist/rules/argocd/embedded-exporter.yml
vendored
1
dist/rules/argocd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ArgocdServiceNotSynced
|
||||
|
|
|
|||
141
dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
vendored
Normal file
141
dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
groups:
|
||||
|
||||
- name: PrometheusCloudwatchExporter
|
||||
|
||||
# CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges.
|
||||
# The rules below cover both exporter health and common AWS service alerts.
|
||||
# Adjust thresholds and label filters to match your CloudWatch exporter configuration.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CloudwatchExporterScrapeError
|
||||
expr: 'cloudwatch_exporter_scrape_error > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CloudWatch exporter scrape error (instance {{ $labels.instance }})
|
||||
description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CloudwatchExporterSlowScrape
|
||||
expr: 'cloudwatch_exporter_scrape_duration_seconds > 300'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CloudWatch exporter slow scrape (instance {{ $labels.instance }})
|
||||
description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests).
|
||||
# 100 requests/minute ≈ $45/month. Adjust the threshold based on your budget.
|
||||
- alert: CloudwatchApiHighRequestRate
|
||||
expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CloudWatch API high request rate (instance {{ $labels.instance }})
|
||||
description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires EC2 CPUUtilization metric configured in the CloudWatch exporter.
|
||||
- alert: AwsEc2HighCpuUtilization
|
||||
expr: 'aws_ec2_cpuutilization_average > 90'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS EC2 high CPU utilization (instance {{ $labels.instance }})
|
||||
description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default.
|
||||
# Adjust based on your database size.
|
||||
- alert: AwsRdsLowFreeStorageSpace
|
||||
expr: 'aws_rds_free_storage_space_average < 2000000000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS RDS low free storage space (instance {{ $labels.instance }})
|
||||
description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires RDS CPUUtilization metric configured in the CloudWatch exporter.
|
||||
- alert: AwsRdsHighCpuUtilization
|
||||
expr: 'aws_rds_cpuutilization_average > 90'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS RDS high CPU utilization (instance {{ $labels.instance }})
|
||||
description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The threshold depends on the RDS instance class. Adjust based on your
|
||||
# instance type's max_connections parameter.
|
||||
- alert: AwsRdsHighDatabaseConnections
|
||||
expr: 'aws_rds_database_connections_average > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS RDS high database connections (instance {{ $labels.instance }})
|
||||
description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000
|
||||
# is a rough default. Adjust based on your expected queue depth.
|
||||
- alert: AwsSqsQueueMessagesVisible
|
||||
expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS SQS queue messages visible (instance {{ $labels.instance }})
|
||||
description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires SQS ApproximateAgeOfOldestMessage metric.
|
||||
- alert: AwsSqsMessageAgeTooOld
|
||||
expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS SQS message age too old (instance {{ $labels.instance }})
|
||||
description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires ApplicationELB UnHealthyHostCount metric.
|
||||
- alert: AwsAlbUnhealthyTargets
|
||||
expr: 'aws_applicationelb_unhealthy_host_count_average > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: AWS ALB unhealthy targets (instance {{ $labels.instance }})
|
||||
description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
|
||||
- alert: AwsAlbHigh5xxErrorRate
|
||||
expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: AWS ALB high 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires ApplicationELB TargetResponseTime metric.
|
||||
- alert: AwsAlbHighTargetResponseTime
|
||||
expr: 'aws_applicationelb_target_response_time_average > 2'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS ALB high target response time (instance {{ $labels.instance }})
|
||||
description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Requires Lambda Errors and Invocations metrics.
|
||||
- alert: AwsLambdaHighErrorRate
|
||||
expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: AWS Lambda high error rate (instance {{ $labels.instance }})
|
||||
description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
57
dist/rules/azure/azure-metrics-exporter.yml
vendored
Normal file
57
dist/rules/azure/azure-metrics-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
groups:
|
||||
|
||||
- name: AzureMetricsExporter
|
||||
|
||||
# The exporter uses azurerm_resource_metric as the default metric name for forwarded Azure Monitor metrics.
|
||||
# The metric name can be customized via the name parameter in probe configuration.
|
||||
# Self-monitoring metrics use the azurerm_stats_* and azurerm_api_* prefixes.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: AzureExporterRequestErrors
|
||||
expr: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Azure exporter request errors (instance {{ $labels.instance }})
|
||||
description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: AzureExporterHighErrorRate
|
||||
expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10 and sum by (instance) (rate(azurerm_stats_metric_requests[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Azure exporter high error rate (instance {{ $labels.instance }})
|
||||
description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Azure Resource Manager enforces rate limits per subscription.
|
||||
# The threshold of 100 remaining calls is a rough default. Adjust based on your
|
||||
# scrape interval and number of monitored resources.
|
||||
- alert: AzureApiReadRateLimitApproaching
|
||||
expr: 'azurerm_api_ratelimit{type="read"} < 100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Azure API read rate limit approaching (instance {{ $labels.instance }})
|
||||
description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: AzureApiWriteRateLimitApproaching
|
||||
expr: 'azurerm_api_ratelimit{type="write"} < 50'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Azure API write rate limit approaching (instance {{ $labels.instance }})
|
||||
description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: AzureExporterSlowCollection
|
||||
expr: 'azurerm_stats_metric_collecttime > 300'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Azure exporter slow collection (instance {{ $labels.instance }})
|
||||
description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
19
dist/rules/blackbox/blackbox-exporter.yml
vendored
19
dist/rules/blackbox/blackbox-exporter.yml
vendored
|
|
@ -2,11 +2,12 @@ groups:
|
|||
|
||||
- name: BlackboxExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: BlackboxProbeFailed
|
||||
expr: 'probe_success == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -23,7 +24,7 @@ groups:
|
|||
description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSlowProbe
|
||||
expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
|
||||
expr: 'probe_duration_seconds > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -33,7 +34,7 @@ groups:
|
|||
|
||||
- alert: BlackboxProbeHttpFailure
|
||||
expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -49,15 +50,19 @@ groups:
|
|||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
- alert: BlackboxSslCertificateWillExpireVerySoon
|
||||
expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
summary: Blackbox SSL certificate will expire very soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
|
||||
# need to enable insecure_skip_verify. Note that this will disable
|
||||
# certificate validation.
|
||||
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
|
||||
- alert: BlackboxSslCertificateExpired
|
||||
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
|
||||
for: 0m
|
||||
|
|
@ -68,7 +73,7 @@ groups:
|
|||
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowHttp
|
||||
expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
|
||||
expr: 'probe_http_duration_seconds > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -77,7 +82,7 @@ groups:
|
|||
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowPing
|
||||
expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
|
||||
expr: 'probe_icmp_duration_seconds > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
33
dist/rules/caddy/embedded-exporter.yml
vendored
Normal file
33
dist/rules/caddy/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CaddyReverseProxyDown
|
||||
expr: 'caddy_reverse_proxy_upstreams_healthy == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
|
||||
description: "Caddy reverse proxy upstream {{ $labels.upstream }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CaddyHighHttp4xxErrorRateService
|
||||
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
|
||||
description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CaddyHighHttp5xxErrorRateService
|
||||
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5 and sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
|
||||
description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: CriteoCassandraExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CassandraHintsCount
|
||||
|
|
@ -14,7 +15,7 @@ groups:
|
|||
description: "Cassandra hints count has changed on {{ $labels.instance }}; some nodes may go down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCompactionTaskPending
|
||||
expr: 'avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[1m]) > 100'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"} > 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -23,7 +24,7 @@ groups:
|
|||
description: "Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraViewwriteLatency
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000'
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile"} > 100000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -31,49 +32,50 @@ groups:
|
|||
summary: Cassandra viewwrite latency (instance {{ $labels.instance }})
|
||||
description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraBadHacker
|
||||
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
|
||||
- alert: CassandraAuthenticationFailures
|
||||
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra bad hacker (instance {{ $labels.instance }})
|
||||
summary: Cassandra authentication failures (instance {{ $labels.instance }})
|
||||
description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: CassandraNodeDown
|
||||
expr: 'sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra node down (instance {{ $labels.instance }})
|
||||
description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCommitlogPendingTasks
|
||||
- alert: CassandraCommitlogPendingTasks(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra commitlog pending tasks (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCompactionExecutorBlockedTasks
|
||||
- alert: CassandraCompactionExecutorBlockedTasks(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra compaction executor blocked tasks (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraFlushWriterBlockedTasks
|
||||
- alert: CassandraFlushWriterBlockedTasks(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra flush writer blocked tasks (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraRepairPendingTasks
|
||||
|
|
@ -94,74 +96,75 @@ groups:
|
|||
summary: Cassandra repair blocked tasks (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraConnectionTimeoutsTotal
|
||||
expr: 'rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
|
||||
- alert: CassandraConnectionTimeoutsTotal(criteo)
|
||||
expr: 'delta(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
|
||||
summary: Cassandra connection timeouts total (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Some connections between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraStorageExceptions
|
||||
- alert: CassandraStorageExceptions(criteo)
|
||||
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
|
||||
summary: Cassandra storage exceptions (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraTombstoneDump
|
||||
- alert: CassandraTombstoneDump(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:table:tombstonescannedhistogram:99thpercentile"} > 1000'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
|
||||
summary: Cassandra tombstone dump (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Too many tombstones scanned in queries\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestUnavailableWrite
|
||||
- alert: CassandraClientRequestUnavailableWrite(criteo)
|
||||
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:unavailables:count"}[1m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request unavailable write (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Write failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestUnavailableRead
|
||||
- alert: CassandraClientRequestUnavailableRead(criteo)
|
||||
expr: 'changes(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:unavailables:count"}[1m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request unavailable read (Criteo) (instance {{ $labels.instance }})
|
||||
description: "Read failures have occurred because too many nodes are unavailable\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestWriteFailure
|
||||
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"}[1m]) > 0'
|
||||
- alert: CassandraClientRequestWriteFailure(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:write:failures:oneminuterate"} > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request write failure (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request write failure (Criteo) (instance {{ $labels.instance }})
|
||||
description: "A lot of write failures encountered. A write failure is a non-timeout exception encountered during a write request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestReadFailure
|
||||
expr: 'increase(cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"}[1m]) > 0'
|
||||
- alert: CassandraClientRequestReadFailure(criteo)
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:read:failures:oneminuterate"} > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request read failure (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request read failure (Criteo) (instance {{ $labels.instance }})
|
||||
description: "A lot of read failures encountered. A read failure is a non-timeout exception encountered during a read request. Examine the reason map to find the root cause. The most common cause for this type of error is when batch sizes are too large.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A low key cache hit rate increases disk I/O. Threshold is workload-dependent — adjust based on your data access patterns.
|
||||
- alert: CassandraCacheHitRateKeyCache
|
||||
expr: 'cassandra_stats{name="org:apache:cassandra:metrics:cache:keycache:hitrate:value"} < .85'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra cache hit rate key cache (instance {{ $labels.instance }})
|
||||
description: "Key cache hit rate is below 85%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -2,11 +2,13 @@ groups:
|
|||
|
||||
- name: InstaclustrCassandraExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: CassandraNodeIsUnavailable
|
||||
expr: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
|
||||
for: 0m
|
||||
expr: 'cassandra_endpoint_active < 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -22,92 +24,92 @@ groups:
|
|||
summary: Cassandra many compaction tasks are pending (instance {{ $labels.instance }})
|
||||
description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCommitlogPendingTasks
|
||||
- alert: CassandraCommitlogPendingTasks(instaclustr)
|
||||
expr: 'cassandra_commit_log_pending_tasks > 15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra commitlog pending tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra commitlog pending tasks (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraCompactionExecutorBlockedTasks
|
||||
- alert: CassandraCompactionExecutorBlockedTasks(instaclustr)
|
||||
expr: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra compaction executor blocked tasks (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraFlushWriterBlockedTasks
|
||||
- alert: CassandraFlushWriterBlockedTasks(instaclustr)
|
||||
expr: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cassandra flush writer blocked tasks (instance {{ $labels.instance }})
|
||||
summary: Cassandra flush writer blocked tasks (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraConnectionTimeoutsTotal
|
||||
expr: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
|
||||
- alert: CassandraConnectionTimeoutsTotal(instaclustr)
|
||||
expr: 'sum by (cassandra_cluster,instance) (rate(cassandra_client_request_timeouts_total[5m])) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra connection timeouts total (instance {{ $labels.instance }})
|
||||
summary: Cassandra connection timeouts total (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Some connections between nodes are ending in timeout - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraStorageExceptions
|
||||
- alert: CassandraStorageExceptions(instaclustr)
|
||||
expr: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra storage exceptions (instance {{ $labels.instance }})
|
||||
summary: Cassandra storage exceptions (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraTombstoneDump
|
||||
- alert: CassandraTombstoneDump(instaclustr)
|
||||
expr: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra tombstone dump (instance {{ $labels.instance }})
|
||||
summary: Cassandra tombstone dump (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestUnavailableWrite
|
||||
- alert: CassandraClientRequestUnavailableWrite(instaclustr)
|
||||
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request unavailable write (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request unavailable write (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestUnavailableRead
|
||||
- alert: CassandraClientRequestUnavailableRead(instaclustr)
|
||||
expr: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request unavailable read (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request unavailable read (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestWriteFailure
|
||||
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
|
||||
- alert: CassandraClientRequestWriteFailure(instaclustr)
|
||||
expr: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request write failure (instance {{ $labels.instance }})
|
||||
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Cassandra client request write failure (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Write failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CassandraClientRequestReadFailure
|
||||
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
|
||||
- alert: CassandraClientRequestReadFailure(instaclustr)
|
||||
expr: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cassandra client request read failure (instance {{ $labels.instance }})
|
||||
summary: Cassandra client request read failure (Instaclustr) (instance {{ $labels.instance }})
|
||||
description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
24
dist/rules/ceph/embedded-exporter.yml
vendored
24
dist/rules/ceph/embedded-exporter.yml
vendored
|
|
@ -2,11 +2,14 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# ceph_health_status: 0=HEALTH_OK, 1=HEALTH_WARN, 2=HEALTH_ERR.
|
||||
# This rule fires on any non-OK state. Split into ==1 (warning) and ==2 (critical) if you want separate severity levels.
|
||||
- alert: CephState
|
||||
expr: 'ceph_health_status != 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -33,15 +36,16 @@ groups:
|
|||
|
||||
- alert: CephOsdDown
|
||||
expr: 'ceph_osd_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Ceph OSD Down (instance {{ $labels.instance }})
|
||||
description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 5000ms (5 seconds). Adjust based on your expected OSD performance.
|
||||
- alert: CephHighOsdLatency
|
||||
expr: 'ceph_osd_perf_apply_latency_seconds > 5'
|
||||
expr: 'ceph_osd_apply_latency_ms > 5000'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -49,14 +53,16 @@ groups:
|
|||
summary: Ceph high OSD latency (instance {{ $labels.instance }})
|
||||
description: "Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CephOsdLowSpace
|
||||
expr: 'ceph_osd_utilization > 90'
|
||||
for: 2m
|
||||
# Ceph internally triggers OSD_NEARFULL based on the nearfull_ratio (default 85%).
|
||||
# ceph_health_detail exposes named health checks as individual time series.
|
||||
- alert: CephOsdNearFull
|
||||
expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Ceph OSD low space (instance {{ $labels.instance }})
|
||||
description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Ceph OSD near full (instance {{ $labels.instance }})
|
||||
description: "A Ceph OSD is dangerously full. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CephOsdReweighted
|
||||
expr: 'ceph_osd_weight < 1'
|
||||
|
|
@ -114,7 +120,7 @@ groups:
|
|||
|
||||
- alert: CephPgUnavailable
|
||||
expr: 'ceph_pg_total - ceph_pg_active > 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
45
dist/rules/cert-manager/embedded-exporter.yml
vendored
Normal file
45
dist/rules/cert-manager/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: Cert-managerAbsent
|
||||
expr: 'absent(up{job="cert-manager"})'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cert-Manager absent (instance {{ $labels.instance }})
|
||||
description: "Cert-Manager has disappeared from Prometheus service discovery. New certificates will not be able to be minted, and existing ones can't be renewed until cert-manager is back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 21 days is a rough default. ACME certificates are typically renewed 30 days before expiry, so expiring within 21 days may indicate issuer misconfiguration.
|
||||
- alert: Cert-managerCertificateExpiringSoon
|
||||
expr: 'avg by (exported_namespace, namespace, name) (certmanager_certificate_expiration_timestamp_seconds - time()) < (21 * 24 * 3600)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cert-Manager certificate expiring soon (instance {{ $labels.instance }})
|
||||
description: "The certificate {{ $labels.name }} is expiring in less than 21 days.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: Cert-managerCertificateNotReady
|
||||
expr: 'max by (name, exported_namespace, namespace, condition) (certmanager_certificate_ready_status{condition!="True"} == 1)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cert-Manager certificate not ready (instance {{ $labels.instance }})
|
||||
description: "The certificate {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready to serve traffic.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Metric renamed in cert-manager v1.19+ (dropped the http_ prefix): certmanager_acme_client_request_count.
|
||||
# For cert-manager < v1.19, use: certmanager_http_acme_client_request_count.
|
||||
- alert: Cert-managerHittingAcmeRateLimits
|
||||
expr: 'sum by (host) (rate(certmanager_acme_client_request_count{status="429"}[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cert-Manager hitting ACME rate limits (instance {{ $labels.instance }})
|
||||
description: "Cert-Manager is being rate-limited by the ACME provider. Certificate issuance and renewal may be blocked for up to a week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
294
dist/rules/cilium/embedded-exporter.yml
vendored
Normal file
294
dist/rules/cilium/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,294 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_unreachable_nodes (older) or cilium_node_connectivity_status (1.14+).
|
||||
- alert: CiliumAgentUnreachableNodes
|
||||
expr: 'sum(cilium_unreachable_nodes{}) by (pod) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent unreachable nodes (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} node(s). Check network connectivity and node health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_unreachable_health_endpoints (older) or cilium_node_connectivity_status (1.14+).
|
||||
- alert: CiliumAgentUnreachableHealthEndpoints
|
||||
expr: 'sum(cilium_unreachable_health_endpoints{}) by (pod) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent unreachable health endpoints (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} cannot reach {{ $value }} health endpoint(s). Node-to-node health probes are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Metric name depends on Cilium version. Use cilium_controllers_failing (older) or cilium_controllers_runs_total (1.14+).
|
||||
- alert: CiliumAgentFailingControllers
|
||||
expr: 'sum(cilium_controllers_failing{}) by (pod) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent failing controllers (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} has {{ $value }} failing controller(s). Check cilium-agent logs for details.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentEndpointFailures
|
||||
expr: 'sum(cilium_endpoint_state{endpoint_state="invalid"}) by (pod) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent endpoint failures (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} has {{ $value }} endpoint(s) in invalid state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentEndpointRegenerationFailures
|
||||
expr: 'sum(rate(cilium_endpoint_regenerations_total{outcome="fail"}[5m])) by (pod) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent endpoint regeneration failures (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is failing to regenerate endpoints. Network policy enforcement may be stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentEndpointUpdateFailure
|
||||
expr: 'sum(rate(cilium_k8s_client_api_calls_total{method=~"(PUT|POST|PATCH)", endpoint="endpoint", return_code!~"2[0-9][0-9]"}[5m])) by (pod, method, return_code) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent endpoint update failure (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is failing K8s endpoint update API calls ({{ $labels.method }} {{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentEndpointCreateFailure
|
||||
expr: 'sum(rate(cilium_api_limiter_processed_requests_total{api_call=~"endpoint-create", outcome="fail"}[1m])) by (pod, api_call) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Cilium agent endpoint create failure (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is failing CNI endpoint-create calls. New pods may fail to get networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentMapOperationFailures
|
||||
expr: 'sum(rate(cilium_bpf_map_ops_total{outcome="fail"}[5m])) by (map_name, pod) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent map operation failures (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} has eBPF map operation failures on {{ $labels.map_name }}. Datapath may be degraded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Map pressure is a ratio from 0 to 1. At 1.0, the map is full and new entries will be dropped.
|
||||
- alert: CiliumAgentBpfMapPressure
|
||||
expr: 'cilium_bpf_map_pressure{} > 0.9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent BPF map pressure (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} eBPF map {{ $labels.map_name }} is above 90% utilization. Map may become full.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentConntrackTableFull
|
||||
expr: 'sum(rate(cilium_drop_count_total{reason="CT: Map insertion failed"}[5m])) by (pod) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium agent conntrack table full (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} conntrack table is full, causing packet drops. Increase CT map size or investigate connection leaks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentConntrackFailedGarbageCollection
|
||||
expr: 'sum(rate(cilium_datapath_conntrack_gc_runs_total{status="uncompleted"}[5m])) by (pod) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent conntrack failed garbage collection (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} conntrack garbage collection is failing. Stale entries may accumulate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentNatTableFull
|
||||
expr: 'sum(rate(cilium_drop_count_total{reason="No mapping for NAT masquerade"}[1m])) by (pod) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium agent NAT table full (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} NAT table is full, causing masquerade failures. Increase NAT map size or investigate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Policy denials may be expected behavior. Investigate only if unexpected traffic is being blocked.
|
||||
- alert: CiliumAgentHighDeniedRate
|
||||
expr: 'sum(rate(cilium_drop_count_total{reason="Policy denied"}[1m])) by (pod) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Cilium agent high denied rate (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is dropping packets due to policy denial. Verify network policies are correct.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentHighDropRate
|
||||
expr: 'sum(rate(cilium_drop_count_total{reason!~"Policy denied"}[5m])) by (pod, reason) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent high drop rate (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is dropping packets for reason {{ $labels.reason }}. This indicates infrastructure issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Map pressure is a 0-1 ratio reported per map. Take the highest ratio among a pod's policy maps; summing ratios across maps would overstate utilization and fire spuriously.
- alert: CiliumAgentPolicyMapPressure
|
||||
expr: 'max(cilium_bpf_map_pressure{map_name=~"cilium_policy_.*"}) by (pod) > 0.9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent policy map pressure (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} policy BPF map is above 90% utilization. New policies may fail to apply.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentPolicyImportErrors
|
||||
expr: 'sum(rate(cilium_policy_change_total{outcome="fail"}[5m])) by (pod) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent policy import errors (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is failing to import network policies. Policy enforcement may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 60s is a rough default. Adjust based on cluster size and policy complexity.
|
||||
- alert: CiliumAgentPolicyImplementationDelay
|
||||
expr: 'histogram_quantile(0.99, sum(rate(cilium_policy_implementation_delay_bucket[5m])) by (le, pod)) > 60'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent policy implementation delay (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} P99 policy deployment latency exceeds 60 seconds. Endpoints may run with stale policies.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when a pod's node-local identity count exceeds 80% of 2^16-1 (65535), the identity limit cited in the description.
- alert: CiliumNode-localHighIdentityAllocation
|
||||
expr: '(sum(cilium_identity{type="node_local"}) by (pod) / (2^16-1)) > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium node-local high identity allocation (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} node-local identity allocation is above 80%. Approaching the 65535 identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when the cluster-wide identity count exceeds 80% of 2^16-256 (65280). Presumably 256 identities are reserved and excluded from the allocatable range; confirm against your Cilium version.
# The aggregation uses "by ()", so the result carries no labels (including instance).
- alert: CiliumClusterHighIdentityAllocation
|
||||
expr: '(sum(cilium_identity{type="cluster_local"}) by () / (2^16-256)) > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium cluster high identity allocation (instance {{ $labels.instance }})
|
||||
description: "Cilium cluster-wide identity allocation is above 80%. Approaching the maximum identity limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumOperatorExhaustedIpamIps
|
||||
expr: 'sum(cilium_operator_ipam_ips{type="available"}) by () <= 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium operator exhausted IPAM IPs (instance {{ $labels.instance }})
|
||||
description: "Cilium operator has no available IPAM IPs. New pods will fail to schedule networking.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 90% is a rough default. Adjust based on your pod churn rate and IP pool size.
|
||||
- alert: CiliumOperatorLowAvailableIpamIps
|
||||
expr: 'sum(cilium_operator_ipam_ips{type!="available"}) by () / sum(cilium_operator_ipam_ips) by () > 0.9 and sum(cilium_operator_ipam_ips) by () > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium operator low available IPAM IPs (instance {{ $labels.instance }})
|
||||
description: "Cilium operator IPAM IP pool is over 90% utilized. Allocate more IPs to avoid exhaustion.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Some Cilium versions may not have a status label on this metric. Verify against your Cilium version.
|
||||
- alert: CiliumOperatorIpamInterfaceCreationFailures
|
||||
expr: 'sum(rate(cilium_operator_ipam_interface_creation_ops{status!="success"}[5m])) by () > 0.05'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium operator IPAM interface creation failures (instance {{ $labels.instance }})
|
||||
description: "Cilium operator is failing to create IPAM network interfaces. IP allocation may be impacted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentApiErrors
|
||||
expr: 'sum(rate(cilium_agent_api_process_time_seconds_count{return_code=~"5[0-9][0-9]"}[5m])) by (pod, return_code) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium agent API errors (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} API is returning 5xx errors ({{ $labels.return_code }}). Agent may be unhealthy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumAgentKubernetesClientErrors
|
||||
expr: 'sum(rate(cilium_k8s_client_api_calls_total{endpoint!="metrics", return_code!~"2[0-9][0-9]"}[5m])) by (pod, endpoint, return_code) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Cilium agent Kubernetes client errors (instance {{ $labels.instance }})
|
||||
description: "Cilium agent {{ $labels.pod }} is receiving errors from K8s API for endpoint {{ $labels.endpoint }} ({{ $labels.return_code }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumClustermeshRemoteClusterNotReady
|
||||
expr: 'count(cilium_clustermesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium ClusterMesh remote cluster not ready (instance {{ $labels.instance }})
|
||||
description: "Cilium ClusterMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumClustermeshRemoteClusterFailing
|
||||
expr: 'sum(cilium_clustermesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium ClusterMesh remote cluster failing (instance {{ $labels.instance }})
|
||||
description: "Cilium ClusterMesh connectivity to remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is failing ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumKvstoremeshRemoteClusterNotReady
|
||||
expr: 'count(cilium_kvstoremesh_remote_cluster_readiness_status < 1) by (source_cluster, target_cluster) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium KVStoreMesh remote cluster not ready (instance {{ $labels.instance }})
|
||||
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} is not ready from {{ $labels.source_cluster }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumKvstoremeshRemoteClusterFailing
|
||||
expr: 'sum(cilium_kvstoremesh_remote_cluster_failures) by (source_cluster, target_cluster) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium KVStoreMesh remote cluster failing (instance {{ $labels.instance }})
|
||||
description: "Cilium KVStoreMesh remote cluster {{ $labels.target_cluster }} from {{ $labels.source_cluster }} is experiencing failures ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumKvstoremeshSyncErrors
|
||||
expr: 'sum(rate(cilium_kvstoremesh_kvstore_sync_errors_total[5m])) by (source_cluster) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cilium KVStoreMesh sync errors (instance {{ $labels.instance }})
|
||||
description: "Cilium KVStoreMesh from {{ $labels.source_cluster }} is experiencing kvstore sync errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CiliumHubbleLostEvents
|
||||
expr: 'sum(rate(hubble_lost_events_total[5m])) by (pod) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium Hubble lost events (instance {{ $labels.instance }})
|
||||
description: "Cilium Hubble on {{ $labels.pod }} is losing flow events. Observability data may be incomplete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10% is a rough default. Some DNS errors may be normal depending on your workload.
|
||||
- alert: CiliumHubbleHighDnsErrorRate
|
||||
expr: 'sum(rate(hubble_dns_responses_total{rcode!="No Error"}[5m])) by (pod) / sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0.1 and sum(rate(hubble_dns_responses_total[5m])) by (pod) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Cilium Hubble high DNS error rate (instance {{ $labels.instance }})
|
||||
description: "Cilium Hubble on {{ $labels.pod }} is observing more than 10% DNS error responses.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
90
dist/rules/clickhouse/embedded-exporter.yml
vendored
90
dist/rules/clickhouse/embedded-exporter.yml
vendored
|
|
@ -2,10 +2,21 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Adjust the job label to match your Prometheus configuration.
|
||||
- alert: ClickhouseNodeDown
|
||||
expr: 'up{job="clickhouse"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: ClickHouse node down (instance {{ $labels.instance }})
|
||||
description: "No metrics received from ClickHouse exporter for over 2 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseMemoryUsageCritical
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -14,7 +25,7 @@ groups:
|
|||
description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseMemoryUsageWarning
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
|
||||
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80 and ClickHouseAsyncMetrics_CGroupMemoryTotal > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -23,7 +34,7 @@ groups:
|
|||
description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseDiskSpaceLowOnDefault
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -32,7 +43,7 @@ groups:
|
|||
description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseDiskSpaceCriticalOnDefault
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10 and (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -41,7 +52,7 @@ groups:
|
|||
description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseDiskSpaceLowOnBackups
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
|
||||
expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20 and (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -76,15 +87,7 @@ groups:
|
|||
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
|
||||
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseHighNetworkTraffic
|
||||
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
|
||||
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighTcpConnections
|
||||
expr: 'ClickHouseMetrics_TCPConnection > 400'
|
||||
for: 5m
|
||||
|
|
@ -94,17 +97,18 @@ groups:
|
|||
summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
|
||||
description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Adjust the threshold based on your cluster size and expected replication traffic.
|
||||
- alert: ClickhouseInterserverConnectionIssues
|
||||
expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
|
||||
for: 1m
|
||||
expr: 'ClickHouseMetrics_InterserverConnection > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
|
||||
description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "High number of interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseZookeeperConnectionIssues
|
||||
expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
|
||||
expr: 'ClickHouseMetrics_ZooKeeperSession != 1'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -113,7 +117,7 @@ groups:
|
|||
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseAuthenticationFailures
|
||||
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
|
||||
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
|
|
@ -122,10 +126,56 @@ groups:
|
|||
description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseAccessDeniedErrors
|
||||
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
|
||||
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
|
||||
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseRejectedInsertQueries
|
||||
expr: 'increase(ClickHouseProfileEvents_RejectedInserts[1m]) > 2'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse rejected insert queries (instance {{ $labels.instance }})
|
||||
description: "INSERTs rejected due to too many active data parts. Reduce insert frequency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseDelayedInsertQueries
|
||||
expr: 'increase(ClickHouseProfileEvents_DelayedInserts[5m]) > 10'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse delayed insert queries (instance {{ $labels.instance }})
|
||||
description: "INSERTs delayed due to high number of active parts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseZookeeperHardwareException
|
||||
expr: 'increase(ClickHouseProfileEvents_ZooKeeperHardwareExceptions[1m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
|
||||
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighNetworkUsage
|
||||
expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: ClickHouse high network usage (instance {{ $labels.instance }})
|
||||
description: "High network usage. ClickHouse network usage exceeds 100MB/s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ClickhouseDistributedRejectedInserts
|
||||
expr: 'increase(ClickHouseProfileEvents_DistributedRejectedInserts[5m]) > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: ClickHouse distributed rejected inserts (instance {{ $labels.instance }})
|
||||
description: "INSERTs into Distributed tables rejected due to pending bytes limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -2,10 +2,11 @@ groups:
|
|||
|
||||
- name: LablabsCloudflareExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CloudflareHttp4xxErrorRate
|
||||
expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5'
|
||||
expr: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[15m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -14,7 +15,7 @@ groups:
|
|||
description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CloudflareHttp5xxErrorRate
|
||||
expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
|
||||
expr: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5 and sum by (zone) (rate(cloudflare_zone_requests_status[5m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
1
dist/rules/consul/consul-exporter.yml
vendored
1
dist/rules/consul/consul-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ConsulExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ConsulServiceHealthcheckFailed
|
||||
|
|
|
|||
1
dist/rules/coredns/embedded-exporter.yml
vendored
1
dist/rules/coredns/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CorednsPanicCount
|
||||
|
|
|
|||
19
dist/rules/cortex/embedded-exporter.yml
vendored
19
dist/rules/cortex/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CortexRulerConfigurationReloadFailure
|
||||
|
|
@ -22,23 +23,25 @@ groups:
|
|||
summary: Cortex not connected to Alertmanager (instance {{ $labels.instance }})
|
||||
description: "Cortex not connected to Alertmanager (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CortexNotificationAreBeingDropped
|
||||
expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0'
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: CortexNotificationsAreBeingDropped
|
||||
expr: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cortex notification are being dropped (instance {{ $labels.instance }})
|
||||
description: "Cortex notification are being dropped due to errors (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Cortex notifications are being dropped (instance {{ $labels.instance }})
|
||||
description: "Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CortexNotificationError
|
||||
expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0'
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: CortexNotificationErrors
|
||||
expr: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Cortex notification error (instance {{ $labels.instance }})
|
||||
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Cortex notification errors (instance {{ $labels.instance }})
|
||||
description: "Cortex is failing when sending alert notifications (instance {{ $labels.instance }}, {{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CortexIngesterUnhealthy
|
||||
expr: 'cortex_ring_members{state="Unhealthy", name="ingester"} > 0'
|
||||
|
|
|
|||
170
dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
vendored
Normal file
170
dist/rules/couchdb/gesellix-couchdb-prometheus-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,170 @@
|
|||
groups:
|
||||
|
||||
- name: GesellixCouchdbPrometheusExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CouchdbNodeDown
|
||||
expr: 'couchdb_httpd_node_up == 0 or couchdb_httpd_up == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB node down (instance {{ $labels.instance }})
|
||||
description: "CouchDB node is not responding (node_up metric is 0) for more than 2 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbAtomMemoryUsageCritical
|
||||
expr: 'couchdb_erlang_memory_atom_used > 0.9 * couchdb_erlang_memory_atom'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB atom memory usage critical (instance {{ $labels.instance }})
|
||||
description: "Atom memory usage is above 90% of limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The default max_dbs_open is 500, so the threshold below is 90% of that default.
# Replace 500 with the max_dbs_open value configured on your nodes if you changed it;
# a threshold above the configured limit can never be reached and the alert stays silent.
- alert: CouchdbOpenDatabasesCritical
  expr: 'couchdb_httpd_open_databases > 0.9 * 500'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: CouchDB open databases critical (instance {{ $labels.instance }})
    description: "Number of open databases exceeds 90% of node capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Adjust 65535 to match your system's file descriptor limit (ulimit -n).
|
||||
- alert: CouchdbOpenOsFilesCritical
|
||||
expr: 'couchdb_httpd_open_os_files > 0.9 * 65535'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB open OS files critical (instance {{ $labels.instance }})
|
||||
description: "CouchDB is using more than 90% of allowed OS file descriptors, may fail to open new files\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Ratio of 5xx responses to all HTTP requests over the last 5 minutes.
# The status-code series are selected by their `code` label, so both sides are
# aggregated with `sum without (code)` before dividing: this sums the individual
# 5xx codes and removes the label that would otherwise prevent one-to-one vector
# matching against the request rate. All other labels (instance, job, ...) are kept.
# NOTE(review): if couchdb_httpd_requests has no `code` label the aggregation on
# that side is a no-op - confirm against your exporter version.
# The trailing `and ... > 0` guard skips targets that served no requests.
- alert: Couchdb5xxErrorRatioHigh
  expr: 'sum without (code) (rate(couchdb_httpd_status_codes{code=~"5.."}[5m])) / sum without (code) (rate(couchdb_httpd_requests[5m])) > 0.05 and sum without (code) (rate(couchdb_httpd_requests[5m])) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: CouchDB 5xx error ratio high (instance {{ $labels.instance }})
    description: "More than 5% of HTTP requests are returning 5xx errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbTemporaryViewReadRateCritical
|
||||
expr: 'rate(couchdb_httpd_temporary_view_reads[5m]) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB temporary view read rate critical (instance {{ $labels.instance }})
|
||||
description: "Temporary view read rate exceeds 100 reads/sec, high risk of performance degradation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbMangoQueriesScanningTooManyDocs
|
||||
expr: 'rate(couchdb_mango_too_many_docs_scanned[5m]) > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CouchDB Mango queries scanning too many docs (instance {{ $labels.instance }})
|
||||
description: "Some Mango queries are scanning too many documents, consider adding indexes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbMangoQueriesFailedDueToInvalidIndex
|
||||
expr: 'rate(couchdb_mango_query_invalid_index[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CouchDB Mango queries failed due to invalid index (instance {{ $labels.instance }})
|
||||
description: "Some Mango queries failed to execute because the index was missing or invalid\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbMangoDocsExaminedHigh
|
||||
expr: 'rate(couchdb_mango_docs_examined[5m]) > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CouchDB Mango docs examined high (instance {{ $labels.instance }})
|
||||
description: "High number of documents examined per Mango queries, consider indexing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicatorManagerDied
|
||||
expr: 'increase(couchdb_replicator_changes_manager_deaths[5m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB Replicator manager died (instance {{ $labels.instance }})
|
||||
description: "Replication manager process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicatorQueueProcessDied
|
||||
expr: 'increase(couchdb_replicator_changes_queue_deaths[5m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB Replicator queue process died (instance {{ $labels.instance }})
|
||||
description: "Replication queue process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicatorReaderProcessDied
|
||||
expr: 'increase(couchdb_replicator_changes_reader_deaths[5m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB Replicator reader process died (instance {{ $labels.instance }})
|
||||
description: "Replication reader process has crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicatorFailedToStart
|
||||
expr: 'increase(couchdb_replicator_failed_starts[5m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB Replicator failed to start (instance {{ $labels.instance }})
|
||||
description: "One or more replication tasks failed to start\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicationClusterUnstable
|
||||
expr: 'couchdb_replicator_cluster_is_stable == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB replication cluster unstable (instance {{ $labels.instance }})
|
||||
description: "The replication cluster is unstable, replication may be interrupted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbReplicationReadFailures
|
||||
expr: 'increase(couchdb_replicator_changes_read_failures[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CouchDB replication read failures (instance {{ $labels.instance }})
|
||||
description: "Replication changes feed has failed reads more than 5 times in 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbFileDescriptorsHigh
|
||||
expr: 'process_open_fds / process_max_fds > 0.85 and process_max_fds > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: CouchDB file descriptors high (instance {{ $labels.instance }})
|
||||
description: "Process is using more than 85% of allowed file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbProcessRestarted
|
||||
expr: 'changes(process_start_time_seconds[1h]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: CouchDB process restarted (instance {{ $labels.instance }})
|
||||
description: "CouchDB process has restarted recently\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: CouchdbCriticalLogEntries
|
||||
expr: 'increase(couchdb_server_couch_log{level=~"error|critical"}[5m]) > 5'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: CouchDB critical log entries (instance {{ $labels.instance }})
|
||||
description: "Critical or error log entries detected in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
97
dist/rules/digitalocean/digitalocean-exporter.yml
vendored
Normal file
97
dist/rules/digitalocean/digitalocean-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
groups:
|
||||
|
||||
- name: DigitaloceanExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: DigitaloceanDropletDown
|
||||
expr: 'digitalocean_droplet_up == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: DigitalOcean droplet down (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanAccountNotActive
|
||||
expr: 'digitalocean_account_active != 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: DigitalOcean account not active (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean account is not active. It may be suspended or locked.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanDatabaseDown
|
||||
expr: 'digitalocean_database_status == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: DigitalOcean database down (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanKubernetesClusterDown
|
||||
expr: 'digitalocean_kubernetes_cluster_up == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: DigitalOcean Kubernetes cluster down (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanLoadBalancerDown
|
||||
expr: 'digitalocean_loadbalancer_status == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: DigitalOcean load balancer down (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanLoadBalancerNoBackends
|
||||
expr: 'digitalocean_loadbalancer_droplets == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: DigitalOcean load balancer no backends (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanFloatingIpNotAssigned
|
||||
expr: 'digitalocean_floating_ipv4_active == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: DigitalOcean floating IP not assigned (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanActiveIncidents
|
||||
expr: 'digitalocean_incidents_total > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: DigitalOcean active incidents (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: DigitaloceanExporterCollectionErrors
|
||||
expr: 'increase(digitalocean_errors_total[5m]) > 3'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: DigitalOcean exporter collection errors (instance {{ $labels.instance }})
|
||||
description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when more than 80% of the account's droplet limit is in use.
# Droplets are counted per scrape target (instance, job) and matched to the limit
# on those same labels: a bare count() drops every label, so it could never be
# paired with the labelled limit series and the alert would stay silent.
# Grouping also keeps `instance` available for the summary annotation.
# The `and ... > 0` guard avoids a division by zero when no limit is reported.
- alert: DigitaloceanDropletLimitApproaching
  expr: '(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80 and on (instance, job) digitalocean_account_droplet_limit > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: DigitalOcean droplet limit approaching (instance {{ $labels.instance }})
    description: "DigitalOcean account is using {{ $value }}% of its droplet quota.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
28
dist/rules/docker-containers/google-cadvisor.yml
vendored
28
dist/rules/docker-containers/google-cadvisor.yml
vendored
|
|
@ -2,8 +2,10 @@ groups:
|
|||
|
||||
- name: GoogleCadvisor
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerKilled
|
||||
expr: 'time() - container_last_seen > 60'
|
||||
for: 0m
|
||||
|
|
@ -13,6 +15,7 @@ groups:
|
|||
summary: Container killed (instance {{ $labels.instance }})
|
||||
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerAbsent
|
||||
expr: 'absent(container_last_seen)'
|
||||
for: 5m
|
||||
|
|
@ -22,15 +25,17 @@ groups:
|
|||
summary: Container absent (instance {{ $labels.instance }})
|
||||
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only fires for containers with explicit CPU limits. Containers without limits have cpu_quota=0, which is filtered out by the guard.
|
||||
- alert: ContainerHighCpuUtilization
|
||||
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
|
||||
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container High CPU utilization (instance {{ $labels.instance }})
|
||||
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Container CPU utilization is above 80% (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
||||
- alert: ContainerHighMemoryUsage
|
||||
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
|
||||
for: 2m
|
||||
|
|
@ -41,7 +46,7 @@ groups:
|
|||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerVolumeUsage
|
||||
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
|
||||
expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 and sum(container_fs_inodes_total) BY (instance) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -50,22 +55,31 @@ groups:
|
|||
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerHighThrottleRate
|
||||
expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
|
||||
expr: 'sum(rate(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 ) and sum(rate(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container high throttle rate (instance {{ $labels.instance }})
|
||||
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Container is being throttled ({{ $value | humanizePercentage }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerHighLowChangeCpuUsage
|
||||
expr: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Container high low change CPU usage (instance {{ $labels.instance }})
|
||||
description: "This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerLowCpuUtilization
|
||||
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
|
||||
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20 and sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) > 0'
|
||||
for: 7d
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Container Low CPU utilization (instance {{ $labels.instance }})
|
||||
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU. (current: {{ $value | printf \"%.2f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerLowMemoryUsage
|
||||
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
|
||||
|
|
|
|||
34
dist/rules/ebpf/ebpf-exporter.yml
vendored
Normal file
34
dist/rules/ebpf/ebpf-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
groups:
|
||||
|
||||
- name: EbpfExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# The exporter uses loose attachment: if a program fails to load (missing BTF, kernel incompatibility), it sets this metric to 0 and continues running.
|
||||
- alert: EbpfExporterProgramNotAttached
|
||||
expr: 'ebpf_exporter_ebpf_program_attached == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: eBPF exporter program not attached (instance {{ $labels.instance }})
|
||||
description: "eBPF program {{ $labels.id }} failed to attach. The program is not collecting data. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EbpfExporterDecoderErrors
|
||||
expr: 'rate(ebpf_exporter_decoder_errors_total[5m]) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: eBPF exporter decoder errors (instance {{ $labels.instance }})
|
||||
description: "eBPF exporter is experiencing decoder errors for config {{ $labels.config }}. Kernel data is not being correctly transformed into labels. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EbpfExporterNoEnabledConfigs
|
||||
expr: 'ebpf_exporter_enabled_configs == 0 or absent(ebpf_exporter_enabled_configs)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: eBPF exporter no enabled configs (instance {{ $labels.instance }})
|
||||
description: "eBPF exporter has no enabled configurations. No eBPF programs are being run. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,10 +2,11 @@ groups:
|
|||
|
||||
- name: PrometheusCommunityElasticsearchExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ElasticsearchHeapUsageTooHigh
|
||||
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90'
|
||||
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -14,7 +15,7 @@ groups:
|
|||
description: "The heap usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchHeapUsageWarning
|
||||
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80'
|
||||
expr: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 and elasticsearch_jvm_memory_max_bytes{area="heap"} > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -23,7 +24,7 @@ groups:
|
|||
description: "The heap usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchDiskOutOfSpace
|
||||
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10'
|
||||
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 and elasticsearch_filesystem_data_size_bytes > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -32,7 +33,7 @@ groups:
|
|||
description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchDiskSpaceLow
|
||||
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20'
|
||||
expr: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 and elasticsearch_filesystem_data_size_bytes > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -58,18 +59,20 @@ groups:
|
|||
summary: Elasticsearch Cluster Yellow (instance {{ $labels.instance }})
|
||||
description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: ElasticsearchHealthyNodes
|
||||
expr: 'elasticsearch_cluster_health_number_of_nodes < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Elasticsearch Healthy Nodes (instance {{ $labels.instance }})
|
||||
description: "Missing node in Elasticsearch cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: ElasticsearchHealthyDataNodes
|
||||
expr: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -114,7 +117,7 @@ groups:
|
|||
|
||||
- alert: ElasticsearchUnassignedShards
|
||||
expr: 'elasticsearch_cluster_health_unassigned_shards > 0'
|
||||
for: 0m
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -139,17 +142,19 @@ groups:
|
|||
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
|
||||
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10ms (0.01s) per indexing operation is a rough default. Adjust based on your document size and cluster performance.
|
||||
- alert: ElasticsearchHighIndexingLatency
|
||||
expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
|
||||
expr: 'rate(elasticsearch_indices_indexing_index_time_seconds_total[5m]) / rate(elasticsearch_indices_indexing_index_total[5m]) > 0.01 and rate(elasticsearch_indices_indexing_index_total[5m]) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
|
||||
description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "The indexing latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10000 ops/s is a rough default. Adjust based on your cluster capacity and expected workload.
|
||||
- alert: ElasticsearchHighIndexingRate
|
||||
expr: 'elasticsearch_indices_indexing_index_total > 100000'
|
||||
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -157,8 +162,9 @@ groups:
|
|||
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
|
||||
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 100 queries/s is very low for most production clusters. Adjust based on your expected query volume.
|
||||
- alert: ElasticsearchHighQueryRate
|
||||
expr: 'elasticsearch_indices_search_query_total > 100000'
|
||||
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -167,10 +173,10 @@ groups:
|
|||
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ElasticsearchHighQueryLatency
|
||||
expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
|
||||
expr: 'rate(elasticsearch_indices_search_query_time_seconds[1m]) / rate(elasticsearch_indices_search_query_total[1m]) > 1 and rate(elasticsearch_indices_search_query_total[1m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
|
||||
description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "The query latency on Elasticsearch cluster is higher than the threshold (current value: {{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
177
dist/rules/envoy/embedded-exporter.yml
vendored
Normal file
177
dist/rules/envoy/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: EnvoyServerNotLive
|
||||
expr: 'envoy_server_live != 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy server not live (instance {{ $labels.instance }})
|
||||
description: "Envoy server is not live (draining or shutting down) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighMemoryUsage
|
||||
expr: 'envoy_server_memory_allocated / envoy_server_memory_heap_size * 100 > 90 and envoy_server_memory_heap_size > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy high memory usage (instance {{ $labels.instance }})
|
||||
description: "Envoy memory allocated is above 90% of heap size on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighDownstreamHttp5xxErrorRate
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="5"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 5 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy high downstream HTTP 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "More than 5% of downstream HTTP responses are 5xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighDownstreamHttp4xxErrorRate
|
||||
expr: 'sum by (instance) (rate(envoy_http_downstream_rq_xx{envoy_response_code_class="4"}[5m])) / sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) * 100 > 10 and sum by (instance) (rate(envoy_http_downstream_rq_completed[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy high downstream HTTP 4xx error rate (instance {{ $labels.instance }})
|
||||
description: "More than 10% of downstream HTTP responses are 4xx on {{ $labels.instance }} ({{ $value | printf \"%.1f\" }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyDownstreamConnectionsOverflowing
|
||||
expr: 'increase(envoy_listener_downstream_cx_overflow[5m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy downstream connections overflowing (instance {{ $labels.instance }})
|
||||
description: "Downstream connections are being rejected due to listener overflow on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyClusterMembershipEmpty
|
||||
expr: 'envoy_cluster_membership_healthy == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy cluster membership empty (instance {{ $labels.instance }})
|
||||
description: "Envoy cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} has no healthy members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyClusterMembershipDegraded
|
||||
expr: 'envoy_cluster_membership_healthy / envoy_cluster_membership_total * 100 < 75 and envoy_cluster_membership_total > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy cluster membership degraded (instance {{ $labels.instance }})
|
||||
description: "Only {{ $value | printf \"%.1f\" }}% of members in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} are healthy (threshold: 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighClusterUpstreamConnectionFailures
|
||||
expr: 'increase(envoy_cluster_upstream_cx_connect_fail[5m]) > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy high cluster upstream connection failures (instance {{ $labels.instance }})
|
||||
description: "High rate of upstream connection failures in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighClusterUpstreamRequestTimeoutRate
|
||||
expr: 'rate(envoy_cluster_upstream_rq_timeout[5m]) / rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy high cluster upstream request timeout rate (instance {{ $labels.instance }})
|
||||
description: "More than 5% of upstream requests are timing out in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Share of upstream requests answered with 5xx, per cluster and instance.
# The numerator is selected with an envoy_response_code_class label; the division ignores
# that label so both sides can be matched one-to-one (otherwise a label present on only
# one side yields an empty result and the alert can never fire).
# The trailing "and ... > 0" keeps idle clusters (no completed requests) out of the ratio.
- alert: EnvoyHighClusterUpstream5xxErrorRate
  expr: 'rate(envoy_cluster_upstream_rq_xx{envoy_response_code_class="5"}[5m]) / ignoring(envoy_response_code_class) rate(envoy_cluster_upstream_rq_completed[5m]) * 100 > 5 and rate(envoy_cluster_upstream_rq_completed[5m]) > 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Envoy high cluster upstream 5xx error rate (instance {{ $labels.instance }})
    description: "More than 5% of upstream requests return 5xx in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyClusterHealthCheckFailures
|
||||
expr: 'increase(envoy_cluster_health_check_failure[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy cluster health check failures (instance {{ $labels.instance }})
|
||||
description: "Health checks are consistently failing in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyClusterOutlierDetectionEjectionsActive
|
||||
expr: 'envoy_cluster_outlier_detection_ejections_active > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Envoy cluster outlier detection ejections active (instance {{ $labels.instance }})
|
||||
description: "There are active outlier detection ejections in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyListenerSslConnectionErrors
|
||||
expr: 'increase(envoy_listener_ssl_connection_error[5m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy listener SSL connection errors (instance {{ $labels.instance }})
|
||||
description: "Envoy listener is experiencing SSL/TLS connection errors on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyGlobalDownstreamConnectionsOverflowing
|
||||
expr: 'increase(envoy_listener_downstream_global_cx_overflow[5m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy global downstream connections overflowing (instance {{ $labels.instance }})
|
||||
description: "Downstream connections are being rejected due to global connection limit on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoySslCertificateExpiringSoon
|
||||
expr: 'envoy_server_days_until_first_cert_expiring < 7'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy SSL certificate expiring soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate loaded by Envoy on {{ $labels.instance }} expires in less than 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when the soonest-expiring certificate has no full day of validity left.
# NOTE(review): Envoy exports gauges as unsigned integers, so this metric is expected to
# bottom out at 0 instead of going negative; a "< 0" comparison would never match.
# Confirm against your Envoy version.
- alert: EnvoySslCertificateExpired
  expr: 'envoy_server_days_until_first_cert_expiring <= 0'
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Envoy SSL certificate expired (instance {{ $labels.instance }})
    description: "SSL certificate loaded by Envoy on {{ $labels.instance }} has expired or expires in less than a day\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyClusterCircuitBreakerTripped
|
||||
expr: 'envoy_cluster_circuit_breakers_default_cx_open == 1 or envoy_cluster_circuit_breakers_default_rq_open == 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy cluster circuit breaker tripped (instance {{ $labels.instance }})
|
||||
description: "Circuit breaker is open for cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyNoHealthyUpstream
|
||||
expr: 'increase(envoy_cluster_upstream_cx_none_healthy[5m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Envoy no healthy upstream (instance {{ $labels.instance }})
|
||||
description: "Upstream connection attempts failed because no healthy upstream was available in cluster {{ $labels.envoy_cluster_name }} on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EnvoyHighDownstreamRequestTimeoutRate
|
||||
expr: 'increase(envoy_http_downstream_rq_timeout[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Envoy high downstream request timeout rate (instance {{ $labels.instance }})
|
||||
description: "Downstream requests are timing out on {{ $labels.instance }} ({{ $value }} in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
40
dist/rules/etcd/embedded-exporter.yml
vendored
40
dist/rules/etcd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: EtcdInsufficientMembers
|
||||
|
|
@ -29,24 +30,26 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of leader changes (instance {{ $labels.instance }})
|
||||
description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Etcd leader changed {{ $value }} times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01'
|
||||
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequestsWarning
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
summary: Etcd high number of failed GRPC requests warning (instance {{ $labels.instance }})
|
||||
description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequests
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05'
|
||||
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
|
||||
- alert: EtcdHighNumberOfFailedGrpcRequestsCritical
|
||||
expr: 'sum(rate(grpc_server_handled_total{grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05 and sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }})
|
||||
summary: Etcd high number of failed GRPC requests critical (instance {{ $labels.instance }})
|
||||
description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdGrpcRequestsSlow
|
||||
|
|
@ -58,24 +61,27 @@ groups:
|
|||
summary: Etcd GRPC requests slow (instance {{ $labels.instance }})
|
||||
description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
|
||||
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
|
||||
- alert: EtcdHighNumberOfFailedHttpRequestsWarning
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
summary: Etcd high number of failed HTTP requests warning (instance {{ $labels.instance }})
|
||||
description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighNumberOfFailedHttpRequests
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
|
||||
# These etcd_http_* metrics are from the etcd v2 API and do not exist in etcd 3.x. Remove these rules if running etcd 3.x.
|
||||
- alert: EtcdHighNumberOfFailedHttpRequestsCritical
|
||||
expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 and sum(rate(etcd_http_received_total[1m])) BY (method) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }})
|
||||
summary: Etcd high number of failed HTTP requests critical (instance {{ $labels.instance }})
|
||||
description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This etcd_http_* metric is from the etcd v2 API and does not exist in etcd 3.x. Remove this rule if running etcd 3.x.
|
||||
- alert: EtcdHttpRequestsSlow
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
|
||||
for: 2m
|
||||
|
|
@ -86,7 +92,7 @@ groups:
|
|||
description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdMemberCommunicationSlow
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, le)) > 0.15'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -101,10 +107,10 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Etcd high number of failed proposals (instance {{ $labels.instance }})
|
||||
description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Etcd server got {{ $value }} failed proposals in the past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighFsyncDurations
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le)) > 0.5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -113,7 +119,7 @@ groups:
|
|||
description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: EtcdHighCommitDurations
|
||||
expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le)) > 0.25'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
42
dist/rules/fluxcd/embedded-exporter.yml
vendored
Normal file
42
dist/rules/fluxcd/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: FluxKustomizationFailure
|
||||
expr: 'gotk_resource_info{ready="False", customresource_kind="Kustomization"} > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flux Kustomization Failure (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FluxHelmreleaseFailure
|
||||
expr: 'gotk_resource_info{ready="False", customresource_kind="HelmRelease"} > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flux HelmRelease Failure (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' in namespace {{ $labels.exported_namespace }} is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FluxSourceIssue
|
||||
expr: 'gotk_resource_info{ready="False", customresource_kind=~"GitRepository|HelmRepository|Bucket|OCIRepository"} > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flux Source Issue (instance {{ $labels.instance }})
|
||||
description: "Flux source {{ $labels.customresource_kind }} '{{ $labels.name }}' has issue(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FluxImageIssue
|
||||
expr: 'gotk_resource_info{ready="False", customresource_kind=~"ImagePolicy|ImageRepository|ImageUpdateAutomation"} > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Flux Image Issue (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.customresource_kind }} '{{ $labels.name }}' is marked as not ready.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,19 +2,20 @@ groups:
|
|||
|
||||
- name: ZnerolFreeswitchExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: FreeswitchDown
|
||||
expr: 'freeswitch_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Freeswitch down (instance {{ $labels.instance }})
|
||||
description: "Freeswitch is unresponsive\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Freeswitch {{ $labels.instance }} is unresponsive.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FreeswitchSessionsWarning
|
||||
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80'
|
||||
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80 and freeswitch_session_limit > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -23,7 +24,7 @@ groups:
|
|||
description: "High sessions usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: FreeswitchSessionsCritical
|
||||
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90'
|
||||
expr: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90 and freeswitch_session_limit > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
66
dist/rules/gitlab-ci/gitaly.yml
vendored
Normal file
66
dist/rules/gitlab-ci/gitaly.yml
vendored
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
groups:
|
||||
|
||||
- name: Gitaly
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Filters to actual error codes. grpc_code!="OK" includes benign codes like NotFound, AlreadyExists, and Cancelled.
# Aggregated per instance so that {{ $labels.instance }} in the annotations is populated;
# an un-grouped sum() removes every label from the result.
- alert: GitlabGitalyHighGrpcErrorRate
  expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code=~"Internal|Unavailable|DeadlineExceeded|ResourceExhausted|Aborted|Unknown|DataLoss"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 5 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab Gitaly high gRPC error rate (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} is returning more than 5% gRPC errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# ResourceExhausted errors from Gitaly mean Git operations are being rejected due to
# concurrency limits. This directly impacts users trying to push, pull, or clone.
# Aggregated per instance so that {{ $labels.instance }} in the annotations is populated;
# an un-grouped sum() removes every label from the result.
- alert: GitlabGitalyResourceExhausted
  expr: 'sum by (instance) (rate(grpc_server_handled_total{job="gitaly",grpc_code="ResourceExhausted"}[5m])) / sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) * 100 > 1 and sum by (instance) (rate(grpc_server_handled_total{job="gitaly"}[5m])) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: GitLab Gitaly resource exhausted (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} is returning ResourceExhausted errors, indicating overload ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# p95 latency of unary Gitaly RPCs over 5 minutes, computed per instance.
# "instance" is kept in the grouping (next to the mandatory "le") so that
# {{ $labels.instance }} in the annotations is populated.
- alert: GitlabGitalyHighRpcLatency
  expr: 'histogram_quantile(0.95, sum(rate(grpc_server_handling_seconds_bucket{job="gitaly",grpc_type="unary"}[5m])) by (le, instance)) > 1'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab Gitaly high RPC latency (instance {{ $labels.instance }})
    description: "Gitaly on {{ $labels.instance }} p95 unary RPC latency exceeds 1 second ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Brief throttling spikes are normal. Threshold of 0.1s/s (10% of CPU time throttled) filters out transient noise.
|
||||
- alert: GitlabGitalyCpuThrottled
|
||||
expr: 'rate(gitaly_cgroup_cpu_cfs_throttled_seconds_total[5m]) > 0.1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Gitaly CPU throttled (instance {{ $labels.instance }})
|
||||
description: "Gitaly processes on {{ $labels.instance }} are being CPU throttled by cgroups.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabGitalyAuthenticationFailures
|
||||
expr: 'increase(gitaly_authentications_total{status="failed"}[5m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Gitaly authentication failures (instance {{ $labels.instance }})
|
||||
description: "Gitaly on {{ $labels.instance }} has authentication failures ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# When the circuit breaker trips to "open" state, Git operations (push, pull, clone) will fail.
|
||||
# Check Gitaly service health and logs.
|
||||
- alert: GitlabGitalyCircuitBreakerTripped
|
||||
expr: 'increase(gitaly_circuit_breaker_transitions_total{to_state="open"}[5m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: GitLab Gitaly circuit breaker tripped (instance {{ $labels.instance }})
|
||||
description: "Gitaly circuit breaker has tripped on {{ $labels.instance }}. Git operations are failing.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
216
dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
vendored
Normal file
216
dist/rules/gitlab-ci/gitlab-built-in-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
groups:
|
||||
|
||||
- name: GitlabBuiltInExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Queued connections indicate Puma workers are saturated.
|
||||
# Consider increasing puma['worker_processes'] or puma['max_threads'] in gitlab.rb.
|
||||
- alert: GitlabPumaHighQueuedConnections
|
||||
expr: 'puma_queued_connections > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Puma high queued connections (instance {{ $labels.instance }})
|
||||
description: "GitLab Puma has {{ $value }} queued connections on {{ $labels.instance }}. Requests are waiting for an available worker thread.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabPumaNoAvailablePoolCapacity
|
||||
expr: 'puma_pool_capacity == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: GitLab Puma no available pool capacity (instance {{ $labels.instance }})
|
||||
description: "GitLab Puma pool capacity on {{ $labels.instance }} has been at 0 for 5 minutes. All threads are busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabPumaWorkersNotRunning
|
||||
expr: 'puma_running_workers < puma_workers'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Puma workers not running (instance {{ $labels.instance }})
|
||||
description: "GitLab Puma on {{ $labels.instance }} has {{ $value }} running workers out of expected total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is 5% of all requests returning server errors.
# Check GitLab logs at /var/log/gitlab/ for root cause.
# Aggregated per instance so that {{ $labels.instance }} in the annotations is populated;
# an un-grouped sum() removes every label from the result.
- alert: GitlabHighHttpErrorRate
  expr: 'sum by (instance) (rate(http_requests_total{status=~"5.."}[5m])) / sum by (instance) (rate(http_requests_total[5m])) * 100 > 5 and sum by (instance) (rate(http_requests_total[5m])) > 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: GitLab high HTTP error rate (instance {{ $labels.instance }})
    description: "GitLab is returning more than 5% HTTP 5xx errors on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10s may need adjustment based on your instance size and workload.
# "instance" is kept in the grouping (next to the mandatory "le") so that
# {{ $labels.instance }} in the annotations is populated.
- alert: GitlabHighHttpRequestLatency
  expr: 'histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab high HTTP request latency (instance {{ $labels.instance }})
    description: "GitLab p95 HTTP request latency on {{ $labels.instance }} is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
|
||||
# A sustained failure rate indicates background processing issues.
|
||||
- alert: GitlabSidekiqJobsFailing
|
||||
expr: 'rate(sidekiq_jobs_failed_total[5m]) > 0.1'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Sidekiq jobs failing (instance {{ $labels.instance }})
|
||||
description: "GitLab Sidekiq jobs are failing at a rate of {{ $value }} per second on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# When running jobs approach the concurrency limit, new jobs will queue up.
# Consider scaling Sidekiq workers or increasing concurrency.
# Both sides are summed per instance so the comparison is made node by node and
# {{ $labels.instance }} in the annotations is populated.
- alert: GitlabSidekiqQueueTooLarge
  expr: 'sum by (instance) (sidekiq_running_jobs) >= sum by (instance) (sidekiq_concurrency) * 0.9'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: GitLab Sidekiq queue too large (instance {{ $labels.instance }})
    description: "GitLab Sidekiq has {{ $value }} running jobs, approaching concurrency limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
# p95 completion time is evaluated per worker class and per instance; "instance" is kept
# in the grouping so that {{ $labels.instance }} in the annotations is populated.
- alert: GitlabSidekiqHighJobCompletionTime
  expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_completion_seconds_bucket[5m])) by (le, worker, instance)) > 300'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: GitLab Sidekiq high job completion time (instance {{ $labels.instance }})
    description: "GitLab Sidekiq job p95 completion time on {{ $labels.instance }} is above 5 minutes ({{ $value | humanizeDuration }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This metric requires the emit_sidekiq_histogram_metrics feature flag to be enabled.
# High queue latency means jobs are stuck waiting. Check Sidekiq concurrency and queue sizes.
# "instance" is kept in the grouping (next to the mandatory "le") so that
# {{ $labels.instance }} in the annotations is populated.
- alert: GitlabSidekiqHighQueueLatency
  expr: 'histogram_quantile(0.95, sum(rate(sidekiq_jobs_queue_duration_seconds_bucket[5m])) by (le, instance)) > 60'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab Sidekiq high queue latency (instance {{ $labels.instance }})
    description: "GitLab Sidekiq jobs on {{ $labels.instance }} are waiting more than 60 seconds before being processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# When the pool is near saturation, requests may block waiting for a connection.
|
||||
# Increase db_pool_size in gitlab.rb or investigate slow queries.
|
||||
- alert: GitlabDatabaseConnectionPoolSaturation
|
||||
expr: 'gitlab_database_connection_pool_busy / gitlab_database_connection_pool_size * 100 > 90 and gitlab_database_connection_pool_size > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab database connection pool saturation (instance {{ $labels.instance }})
|
||||
description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) is {{ $value }}% busy.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabDatabaseConnectionPoolDeadConnections
|
||||
expr: 'gitlab_database_connection_pool_dead > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab database connection pool dead connections (instance {{ $labels.instance }})
|
||||
description: "GitLab database connection pool on {{ $labels.instance }} ({{ $labels.class }}) has {{ $value }} dead connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabDatabaseConnectionPoolWaiting
|
||||
expr: 'gitlab_database_connection_pool_waiting > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab database connection pool waiting (instance {{ $labels.instance }})
|
||||
description: "GitLab on {{ $labels.instance }} has {{ $value }} threads waiting for a database connection.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# p95 CI pipeline creation time over 5 minutes, computed per instance.
# "instance" is kept in the grouping (next to the mandatory "le") so that
# {{ $labels.instance }} in the annotations is populated.
- alert: GitlabCiPipelineCreationSlow
  expr: 'histogram_quantile(0.95, sum(rate(gitlab_ci_pipeline_creation_duration_seconds_bucket[5m])) by (le, instance)) > 30'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: GitLab CI pipeline creation slow (instance {{ $labels.instance }})
    description: "GitLab CI pipeline creation p95 latency on {{ $labels.instance }} is above 30 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This metric may not exist in all GitLab versions. Verify against your GitLab installation.
# NOTE(review): GitLab documents gitlab_ci_pipeline_failure_reasons as a counter, so the
# per-second growth is taken with rate(), which compensates for counter resets on process
# restart; deriv() is intended for gauges. Confirm the metric type on your version.
- alert: GitlabCiPipelineFailuresIncreasing
  expr: 'rate(gitlab_ci_pipeline_failure_reasons[5m]) > 0.05'
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: GitLab CI pipeline failures increasing (instance {{ $labels.instance }})
    description: "GitLab CI pipeline failures are increasing on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Frequent runner auth failures may indicate expired tokens or misconfigured runners.
|
||||
- alert: GitlabCiRunnerAuthenticationFailures
|
||||
expr: 'increase(gitlab_ci_runner_authentication_failure_total[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab CI runner authentication failures (instance {{ $labels.instance }})
|
||||
description: "GitLab CI runners are experiencing authentication failures on {{ $labels.instance }} ({{ $value }} failures).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 2GB may need adjustment based on your instance size.
|
||||
# High memory usage can lead to OOM kills and service disruptions.
|
||||
- alert: GitlabHighMemoryUsage
|
||||
expr: 'process_resident_memory_bytes{job=~".*gitlab.*"} > 2e+9'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab high memory usage (instance {{ $labels.instance }})
|
||||
description: "GitLab process on {{ $labels.instance }} is using {{ $value | humanize1024 }}B of RSS memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Heap fragmentation above 50% means a significant amount of memory is wasted.
|
||||
# A Puma worker restart may help reclaim memory.
|
||||
- alert: GitlabRubyHeapFragmentation
|
||||
expr: 'ruby_gc_stat_ext_heap_fragmentation{job=~".*gitlab.*"} > 0.5'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Ruby heap fragmentation (instance {{ $labels.instance }})
|
||||
description: "GitLab Ruby heap fragmentation on {{ $labels.instance }} is {{ $value }}. High fragmentation wastes memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabRackUncaughtErrors
|
||||
expr: 'rate(rack_uncaught_errors_total[5m]) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab rack uncaught errors (instance {{ $labels.instance }})
|
||||
description: "GitLab is experiencing uncaught errors in the Rack layer on {{ $labels.instance }} ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This may happen during a rolling deployment. If it persists, investigate incomplete upgrades.
|
||||
- alert: GitlabVersionMismatch
|
||||
expr: 'count(count by (version) (gitlab_build_info)) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab version mismatch (instance {{ $labels.instance }})
|
||||
description: "Multiple GitLab versions are running across the fleet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabHighFileDescriptorUsage
|
||||
expr: 'process_open_fds{job=~".*gitlab.*"} / process_max_fds * 100 > 80 and process_max_fds > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab high file descriptor usage (instance {{ $labels.instance }})
|
||||
description: "GitLab on {{ $labels.instance }} is using {{ $value }}% of available file descriptors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GitlabRubyThreadsSaturated
|
||||
expr: 'sum by (instance) (gitlab_ruby_threads_running_threads) > on(instance) gitlab_ruby_threads_max_expected_threads * 1.5'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Ruby threads saturated (instance {{ $labels.instance }})
|
||||
description: "GitLab running threads on {{ $labels.instance }} have exceeded the expected maximum ({{ $value }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
36
dist/rules/gitlab-ci/workhorse.yml
vendored
Normal file
36
dist/rules/gitlab-ci/workhorse.yml
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
groups:
|
||||
|
||||
- name: Workhorse
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Workhorse sits in front of Puma and handles Git HTTP, file uploads, and proxying.
|
||||
# Threshold from GitLab Omnibus default rules: 10% for high-traffic instances. The ratio is computed per instance so that the instance label used in the annotations is populated.
|
||||
- alert: GitlabWorkhorseHighErrorRate
|
||||
expr: 'sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count{code=~"5.."}[5m])) / sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) * 100 > 10 and sum by (instance) (rate(gitlab_workhorse_http_request_duration_seconds_count[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: GitLab Workhorse high error rate (instance {{ $labels.instance }})
|
||||
description: "GitLab Workhorse on {{ $labels.instance }} is returning more than 10% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# p95 is computed per instance: the instance label must survive the aggregation because the annotations reference it.
|
||||
- alert: GitlabWorkhorseHighLatency
|
||||
expr: 'histogram_quantile(0.95, sum(rate(gitlab_workhorse_http_request_duration_seconds_bucket[5m])) by (le, instance)) > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Workhorse high latency (instance {{ $labels.instance }})
|
||||
description: "GitLab Workhorse on {{ $labels.instance }} p95 request latency is above 10 seconds.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 100 may need adjustment based on instance size.
|
||||
- alert: GitlabWorkhorseHighIn-flightRequests
|
||||
expr: 'gitlab_workhorse_http_in_flight_requests > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: GitLab Workhorse high in-flight requests (instance {{ $labels.instance }})
|
||||
description: "GitLab Workhorse on {{ $labels.instance }} has {{ $value }} in-flight requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
109
dist/rules/golang/golang-exporter.yml
vendored
Normal file
109
dist/rules/golang/golang-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
groups:
|
||||
|
||||
- name: GolangExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Threshold is a rough default. High-concurrency servers may legitimately run thousands of goroutines. Adjust to match your baseline.
|
||||
- alert: GoGoroutineCountHigh
|
||||
expr: 'go_goroutines > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go goroutine count high (instance {{ $labels.instance }})
|
||||
description: "Go application has too many goroutines (> 1000), potential goroutine leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# quantile="1" is the maximum observed GC pause in the current summary window, not p99.
|
||||
# A single outlier pause can push this above 1s. The for: 5m ensures the max stays elevated.
|
||||
- alert: GoGcDurationHigh
|
||||
expr: 'go_gc_duration_seconds{quantile="1"} > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go GC duration high (instance {{ $labels.instance }})
|
||||
description: "Go GC pause duration is too high (max > 1s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# go_memstats_sys_bytes is the total memory obtained from the OS by the Go runtime, not total host memory.
|
||||
# This ratio measures Go-internal memory utilization, not system-level memory pressure.
|
||||
- alert: GoMemoryUsageHigh
|
||||
expr: '(go_memstats_heap_alloc_bytes / go_memstats_sys_bytes) * 100 > 90'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go memory usage high (instance {{ $labels.instance }})
|
||||
description: "Go heap allocation is using most of the runtime's reserved memory (> 90%), indicating the process may need more memory or has a leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is workload-dependent. Applications with heavy CGo or blocking I/O may legitimately use more OS threads. Adjust to match your baseline.
|
||||
- alert: GoThreadCountHigh
|
||||
expr: 'go_threads > 500'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go thread count high (instance {{ $labels.instance }})
|
||||
description: "Go OS thread count is high (> 500), potential blocking syscall or CGo leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold is a rough default. Adjust based on your application's normal object count.
|
||||
- alert: GoHeapObjectsCountHigh
|
||||
expr: 'go_memstats_heap_objects > 10000000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go heap objects count high (instance {{ $labels.instance }})
|
||||
description: "Go heap has too many live objects (> 10M), high GC pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# rate(go_gc_duration_seconds_sum) approximates the fraction of wall-clock time spent in GC.
|
||||
# This replaces go_memstats_gc_cpu_fraction which was removed in client_golang v1.12+.
|
||||
- alert: GoGcCpuFractionHigh
|
||||
expr: 'rate(go_gc_duration_seconds_sum[5m]) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go GC CPU fraction high (instance {{ $labels.instance }})
|
||||
description: "Go GC is consuming too much CPU (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A threshold of 100/s only catches catastrophic leaks (30k goroutines in 5m). 10/s catches gradual leaks (~3k in 5m).
|
||||
# Adjust based on your application's expected concurrency patterns.
|
||||
- alert: GoGoroutineSpike
|
||||
expr: 'deriv(go_goroutines[5m]) > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go goroutine spike (instance {{ $labels.instance }})
|
||||
description: "Go goroutine count is growing rapidly ({{ $value | printf \"%.0f\" }} goroutines/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Alerts when heap in-use grows by more than 10MB/s sustained over 10 minutes.
|
||||
# Adjust threshold based on your workload.
|
||||
- alert: GoHeapIn-useGrowing
|
||||
expr: 'deriv(go_memstats_heap_inuse_bytes[10m]) > 1e7'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go heap in-use growing (instance {{ $labels.instance }})
|
||||
description: "Go heap in-use memory is growing steadily, potential memory leak or under-sized heap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GoMemoryLeak
|
||||
expr: 'rate(go_memstats_alloc_bytes_total[5m]) > 1e9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go memory leak (instance {{ $labels.instance }})
|
||||
description: "Go application has sustained high allocation rate (> 1GB/s), potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: GoStackMemoryHigh
|
||||
expr: 'go_memstats_stack_inuse_bytes > 1e9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Go stack memory high (instance {{ $labels.instance }})
|
||||
description: "Go stack memory usage is high (> 1GB), likely excessive goroutines or deep recursion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
53
dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml
vendored
Normal file
53
dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
groups:
|
||||
|
||||
- name: StackdriverExporter
|
||||
|
||||
# Self-monitoring metrics use the stackdriver_monitoring_* prefix.
|
||||
# All self-monitoring metrics include a project_id label.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: StackdriverExporterScrapeError
|
||||
expr: 'stackdriver_monitoring_last_scrape_error > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Stackdriver exporter scrape error (instance {{ $labels.instance }})
|
||||
description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StackdriverExporterSlowScrape
|
||||
expr: 'stackdriver_monitoring_last_scrape_duration_seconds > 300'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Stackdriver exporter slow scrape (instance {{ $labels.instance }})
|
||||
description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StackdriverExporterScrapeErrorsIncreasing
|
||||
expr: 'increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Stackdriver exporter scrape errors increasing (instance {{ $labels.instance }})
|
||||
description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StackdriverExporterHighApiCalls
|
||||
expr: 'rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Stackdriver exporter high API calls (instance {{ $labels.instance }})
|
||||
description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StackdriverExporterScrapeStale
|
||||
expr: 'time() - stackdriver_monitoring_last_scrape_timestamp > 600'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Stackdriver exporter scrape stale (instance {{ $labels.instance }})
|
||||
description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
15
dist/rules/grafana-alloy/embedded-exporter.yml
vendored
Normal file
15
dist/rules/grafana-alloy/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: GrafanaAlloyServiceDown
|
||||
expr: 'count by (instance) (alloy_build_info offset 2h) unless count by (instance) (alloy_build_info)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Grafana Alloy service down (instance {{ $labels.instance }})
|
||||
description: "Alloy on instance {{ $labels.instance }} is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
465
dist/rules/grafana-mimir/embedded-exporter.yml
vendored
Normal file
465
dist/rules/grafana-mimir/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,465 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
# Mimir uses the `cortex_` metric prefix for backward compatibility with Cortex. This is intentional and expected.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MimirIngesterUnhealthy
|
||||
expr: 'min by (job) (cortex_ring_members{state="Unhealthy", name="ingester"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester unhealthy (instance {{ $labels.instance }})
|
||||
description: "Mimir has {{ $value }} unhealthy ingester(s) in the ring.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRequestErrors
|
||||
expr: '100 * sum by (job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route!~"ready|debug_pprof"}[5m])) / sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 1 and sum by (job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[5m])) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir request errors (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirInconsistentRuntimeConfig
|
||||
expr: 'count(count by (job, sha256) (cortex_runtime_config_hash)) without(sha256) > 1'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir inconsistent runtime config (instance {{ $labels.instance }})
|
||||
description: "An inconsistent runtime config file is used across Mimir instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirBadRuntimeConfig
|
||||
expr: 'sum by (job) (cortex_runtime_config_last_reload_successful == 0) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir bad runtime config (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.job }} failed to reload runtime config.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirSchedulerQueriesStuck
|
||||
expr: 'sum by (job) (min_over_time(cortex_query_scheduler_queue_length[1m])) > 0'
|
||||
for: 7m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir scheduler queries stuck (instance {{ $labels.instance }})
|
||||
description: "There are {{ $value }} queued up queries in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCacheRequestErrors
|
||||
expr: '(sum by (name, operation, job) (rate(thanos_cache_operation_failures_total[5m])) / sum by (name, operation, job) (rate(thanos_cache_operations_total[5m]))) * 100 > 5 and sum by (name, operation, job) (rate(thanos_cache_operations_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir cache request errors (instance {{ $labels.instance }})
|
||||
description: "Mimir cache {{ $labels.name }} is experiencing {{ printf \"%.2f\" $value }}% errors for {{ $labels.operation }} operation.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirKvStoreFailure
|
||||
expr: '(sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.."}[5m])) / sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m]))) == 1 and sum by (job, kv_name) (rate(cortex_kv_request_duration_seconds_count[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir KV store failure (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.job }} KV store {{ $labels.kv_name }} is failing with 100% error rate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirMemoryMapAreasTooHigh
|
||||
expr: 'process_memory_map_areas{job=~".*(ingester|cortex|mimir|store-gateway).*"} / process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} * 100 > 80 and process_memory_map_areas_limit{job=~".*(ingester|cortex|mimir|store-gateway).*"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir memory map areas too high (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.job }} is using {{ printf \"%.0f\" $value }}% of its memory map area limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngesterInstanceHasNoTenants
|
||||
expr: '(cortex_ingester_memory_users == 0) and on (instance) (cortex_ingester_memory_users offset 1h > 0)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ingester instance has no tenants (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRulerInstanceHasNoRuleGroups
|
||||
expr: '(cortex_ruler_managers_total == 0) and on (instance) (cortex_ruler_managers_total offset 1h > 0)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ruler instance has no rule groups (instance {{ $labels.instance }})
|
||||
description: "Mimir ruler {{ $labels.instance }} has no rule groups assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngestedDataTooFarInTheFuture
|
||||
expr: 'max by (job) (cortex_ingester_tsdb_head_max_timestamp_seconds - time() and cortex_ingester_tsdb_head_max_timestamp_seconds > 0) > 3600'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ingested data too far in the future (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.job }} has ingested samples with timestamps more than 1 hour in the future.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirStoreGatewayTooManyFailedOperations
|
||||
expr: 'sum by (job) (rate(thanos_objstore_bucket_operation_failures_total[5m])) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir store gateway too many failed operations (instance {{ $labels.instance }})
|
||||
description: "Mimir store-gateway {{ $labels.job }} bucket operations are failing ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRingMembersMismatch
|
||||
expr: 'max by (name, job) (sum by (name, job, instance) (cortex_ring_members)) != min by (name, job) (sum by (name, job, instance) (cortex_ring_members))'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ring members mismatch (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.name }} ring has inconsistent member counts across instances.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngesterReachingSeriesLimitWarning
|
||||
expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_series"} > 0'
|
||||
for: 3h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ingester reaching series limit warning (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngesterReachingSeriesLimitCritical
|
||||
expr: '(cortex_ingester_memory_series / ignoring(limit) cortex_ingester_instance_limits{limit="max_series"} * 100 > 90) and cortex_ingester_instance_limits{limit="max_series"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester reaching series limit critical (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its series limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngesterReachingTenantsLimitWarning
|
||||
expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 70) and cortex_ingester_instance_limits{limit="max_tenants"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ingester reaching tenants limit warning (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirIngesterReachingTenantsLimitCritical
|
||||
expr: '(cortex_ingester_memory_users / ignoring(limit) cortex_ingester_instance_limits{limit="max_tenants"} * 100 > 80) and cortex_ingester_instance_limits{limit="max_tenants"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester reaching tenants limit critical (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} has reached {{ printf \"%.0f\" $value }}% of its tenants limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirReachingTcpConnectionsLimit
|
||||
expr: 'cortex_tcp_connections / cortex_tcp_connections_limit * 100 > 80 and cortex_tcp_connections_limit > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir reaching TCP connections limit (instance {{ $labels.instance }})
|
||||
description: "Mimir instance {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its TCP connections limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirDistributorInflightRequestsHigh
|
||||
expr: '(cortex_distributor_inflight_push_requests / ignoring(limit) cortex_distributor_instance_limits{limit="max_inflight_push_requests"} * 100 > 80) and cortex_distributor_instance_limits{limit="max_inflight_push_requests"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir distributor inflight requests high (instance {{ $labels.instance }})
|
||||
description: "Mimir distributor {{ $labels.instance }} is using {{ printf \"%.0f\" $value }}% of its inflight push requests limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbHeadCompactionFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_compactions_failed_total[5m]) > 0.05'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB head compaction failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to compact TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbHeadTruncationFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_head_truncations_failed_total[5m]) > 0.05'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB head truncation failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB head ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbCheckpointCreationFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_checkpoint_creations_failed_total[5m]) > 0.05'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB checkpoint creation failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to create TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbCheckpointDeletionFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_checkpoint_deletions_failed_total[5m]) > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB checkpoint deletion failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to delete TSDB checkpoints ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbWalTruncationFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_wal_truncations_failed_total[5m]) > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB WAL truncation failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to truncate TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirIngesterTsdbWalWritesFailed
|
||||
expr: 'rate(cortex_ingester_tsdb_wal_writes_failed_total[1m]) > 0.05'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ingester TSDB WAL writes failed (instance {{ $labels.instance }})
|
||||
description: "Mimir ingester {{ $labels.instance }} is failing to write to TSDB WAL ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 30 minutes. Adjust based on your sync interval.
|
||||
- alert: MimirStoreGatewayHasNotSyncedBucket
|
||||
expr: '(time() - cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 1800) and cortex_bucket_stores_blocks_last_successful_sync_timestamp_seconds{component="store-gateway"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir store gateway has not synced bucket (instance {{ $labels.instance }})
|
||||
description: "Mimir store-gateway {{ $labels.instance }} has not synced the bucket for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirStoreGatewayNoSyncedTenants
|
||||
expr: '(min by (instance, job) (cortex_bucket_stores_tenants_synced{component="store-gateway"}) == 0) and on (instance) (cortex_bucket_stores_tenants_synced{component="store-gateway"} offset 1h > 0)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir store gateway no synced tenants (instance {{ $labels.instance }})
|
||||
description: "Mimir store-gateway {{ $labels.instance }} has no synced tenants.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirBucketIndexNotUpdated
|
||||
expr: 'min by (user, job) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 2100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir bucket index not updated (instance {{ $labels.instance }})
|
||||
description: "Mimir bucket index for tenant {{ $labels.user }} has not been updated for more than 35 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCompactorNotCleaningUpBlocks
|
||||
expr: '(time() - cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 21600) and cortex_compactor_block_cleanup_last_successful_run_timestamp_seconds > 0'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir compactor not cleaning up blocks (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has not cleaned up blocks in the last 6 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCompactorNotRunningCompaction
|
||||
expr: '(time() - cortex_compactor_last_successful_run_timestamp_seconds > 86400) and cortex_compactor_last_successful_run_timestamp_seconds > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir compactor not running compaction (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has not run compaction in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCompactorHasConsecutiveFailures
|
||||
expr: 'increase(cortex_compactor_runs_failed_total{reason!="shutdown"}[2h]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir compactor has consecutive failures (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has had {{ $value }} compaction failures in the last 2 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# cortex_compactor_disk_out_of_space_errors_total is declared as gauge by Mimir despite the _total suffix, so delta() is used instead of increase().
|
||||
- alert: MimirCompactorHasRunOutOfDiskSpace
|
||||
expr: 'delta(cortex_compactor_disk_out_of_space_errors_total[24h]) >= 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir compactor has run out of disk space (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has run out of disk space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirCompactorHasNotUploadedBlocks
|
||||
expr: '(time() - thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 86400) and thanos_objstore_bucket_last_successful_upload_time{component="compactor"} > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir compactor has not uploaded blocks (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor {{ $labels.instance }} has not uploaded any block in the last 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Using a 24h window as compaction skips are rare events.
|
||||
- alert: MimirCompactorSkippedBlocks
|
||||
expr: 'increase(cortex_compactor_blocks_marked_for_no_compaction_total[24h]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir compactor skipped blocks (instance {{ $labels.instance }})
|
||||
description: "Mimir compactor has found {{ $value }} blocks that cannot be compacted (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRulerTooManyFailedPushes
|
||||
expr: '100 * sum by (instance, job) (rate(cortex_ruler_write_requests_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_write_requests_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ruler too many failed pushes (instance {{ $labels.instance }})
|
||||
description: "Mimir ruler {{ $labels.instance }} is failing to push {{ printf \"%.2f\" $value }}% of write requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRulerTooManyFailedQueries
|
||||
expr: '100 * sum by (instance, job) (rate(cortex_ruler_queries_failed_total[5m])) / sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 1 and sum by (instance, job) (rate(cortex_ruler_queries_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ruler too many failed queries (instance {{ $labels.instance }})
|
||||
description: "Mimir ruler {{ $labels.instance }} is failing {{ printf \"%.2f\" $value }}% of query evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirRulerMissedEvaluations
|
||||
expr: '100 * sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_missed_total[5m])) / sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 1 and sum by (instance, job) (rate(cortex_prometheus_rule_group_iterations_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir ruler missed evaluations (instance {{ $labels.instance }})
|
||||
description: "Mimir ruler {{ $labels.instance }} is missing {{ printf \"%.2f\" $value }}% of rule group evaluations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirRulerFailedRingCheck
|
||||
expr: 'sum by (job) (rate(cortex_ruler_ring_check_errors_total[5m])) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir ruler failed ring check (instance {{ $labels.instance }})
|
||||
description: "Mimir ruler {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirAlertmanagerSyncConfigsFailing
|
||||
expr: 'rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0.05'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir alertmanager sync configs failing (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} is failing to sync configs ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirAlertmanagerRingCheckFailing
|
||||
expr: 'rate(cortex_alertmanager_ring_check_errors_total[5m]) > 0.05'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir alertmanager ring check failing (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} is failing ring checks ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirAlertmanagerStateMergeFailing
|
||||
expr: 'rate(cortex_alertmanager_partial_state_merges_failed_total[5m]) > 0.05'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir alertmanager state merge failing (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} is failing to merge state updates ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirAlertmanagerReplicationFailing
|
||||
expr: 'rate(cortex_alertmanager_state_replication_failed_total[5m]) > 0.05'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir alertmanager replication failing (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} is failing to replicate state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: MimirAlertmanagerPersistStateFailing
|
||||
expr: 'rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0.05'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir alertmanager persist state failing (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} is failing to persist state ({{ $value | humanize }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirAlertmanagerInitialSyncFailed
|
||||
expr: 'increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir alertmanager initial sync failed (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.job }} failed initial state sync.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirAlertmanagerInstanceHasNoTenants
|
||||
expr: '(cortex_alertmanager_tenants_owned == 0) and on (instance) (cortex_alertmanager_tenants_owned offset 1h > 0)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir alertmanager instance has no tenants (instance {{ $labels.instance }})
|
||||
description: "Mimir alertmanager {{ $labels.instance }} has no tenants assigned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirGossipMembersCountTooHigh
|
||||
expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 1.15 + 10 < max(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir gossip members count too high (instance {{ $labels.instance }})
|
||||
description: "Mimir gossip cluster has more members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirGossipMembersCountTooLow
|
||||
expr: 'avg(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job) * 0.5 > min(memberlist_client_cluster_members_count{job=~".*(mimir|cortex).*"}) by (job)'
|
||||
for: 20m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir gossip members count too low (instance {{ $labels.instance }})
|
||||
description: "Mimir gossip cluster has fewer members than expected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
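# go_threads counts OS threads created by the Go runtime, not goroutines. A high number usually means many goroutines are blocked in system calls or cgo (possibly due to a goroutine leak); the Go runtime aborts the process at 10000 threads by default.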
|
||||
- alert: MimirGoThreadsTooHighWarning
|
||||
expr: 'go_threads{job=~".*(mimir|cortex).*"} > 5000'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Mimir go threads too high warning (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MimirGoThreadsTooHighCritical
|
||||
expr: 'go_threads{job=~".*(mimir|cortex).*"} > 8000'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mimir go threads too high critical (instance {{ $labels.instance }})
|
||||
description: "Mimir {{ $labels.instance }} has {{ $value }} Go threads.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
175
dist/rules/grafana-tempo/embedded-exporter.yml
vendored
Normal file
175
dist/rules/grafana-tempo/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,175 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: TempoDistributorUnhealthy
|
||||
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="distributor"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Tempo distributor unhealthy (instance {{ $labels.instance }})
|
||||
description: "Tempo has {{ $value }} unhealthy distributor(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoLiveStoreUnhealthy
|
||||
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="live-store"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo live store unhealthy (instance {{ $labels.instance }})
|
||||
description: "Tempo has {{ $value }} unhealthy live store(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoMetricsGeneratorUnhealthy
|
||||
expr: 'max by (job) (tempo_ring_members{state="Unhealthy", name="metrics-generator"}) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo metrics generator unhealthy (instance {{ $labels.instance }})
|
||||
description: "Tempo has {{ $value }} unhealthy metrics generator(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Uses a two-window approach: 1h for historical count and 5m to confirm the issue is ongoing.
|
||||
- alert: TempoCompactionsFailing
|
||||
expr: 'sum by (job) (increase(tempodb_compaction_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_compaction_errors_total[5m])) > 0'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo compactions failing (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} compactions have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoPollsFailing
|
||||
expr: 'sum by (job) (increase(tempodb_blocklist_poll_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_poll_errors_total[5m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo polls failing (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} blocklist polls have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoTenantIndexFailures
|
||||
expr: 'sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[1h])) > 2 and sum by (job) (increase(tempodb_blocklist_tenant_index_errors_total[5m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo tenant index failures (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} tenant index failures in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoNoTenantIndexBuilders
|
||||
expr: 'sum by (tenant) (tempodb_blocklist_tenant_index_builder) == 0 and on() max(tempodb_blocklist_length) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo no tenant index builders (instance {{ $labels.instance }})
|
||||
description: "No tenant index builders for tenant {{ $labels.tenant }}. Tenant index will quickly become stale.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 600s (10 minutes). Adjust based on your tenant index build interval.
|
||||
- alert: TempoTenantIndexTooOld
|
||||
expr: 'max by (tenant) (tempodb_blocklist_tenant_index_age_seconds) > 600'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo tenant index too old (instance {{ $labels.instance }})
|
||||
description: "Tenant index for {{ $labels.tenant }} is {{ $value }}s old.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when the blocklist grows more than 40% over 7 days.
|
||||
- alert: TempoBlockListRisingQuickly
|
||||
expr: '(avg(tempodb_blocklist_length) / avg(tempodb_blocklist_length offset 7d) - 1) * 100 > 40 and avg(tempodb_blocklist_length offset 7d) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo block list rising quickly (instance {{ $labels.instance }})
|
||||
description: "Tempo blocklist length is up {{ printf \"%.0f\" $value }}% over the last 7 days. Consider scaling compactors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoBadOverrides
|
||||
expr: 'sum by (job) (tempo_runtime_config_last_reload_successful == 0) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo bad overrides (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.job }} failed to reload runtime overrides.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoUserConfigurableOverridesReloadFailing
|
||||
expr: 'sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[1h])) > 5 and sum by (job) (increase(tempo_overrides_user_configurable_overrides_reload_failed_total[5m])) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo user configurable overrides reload failing (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} user-configurable overrides reloads have failed in the past hour.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 100 blocks per compactor instance. Adjust based on your environment.
|
||||
- alert: TempoCompactionTooManyOutstandingBlocksWarning
|
||||
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 100'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Tempo compaction too many outstanding blocks warning (instance {{ $labels.instance }})
|
||||
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Consider increasing compactor resources.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 250 blocks per compactor instance. Normalize by backend-worker count if needed. Adjust based on your environment.
|
||||
- alert: TempoCompactionTooManyOutstandingBlocksCritical
|
||||
expr: 'sum by (instance) (tempodb_compaction_outstanding_blocks) > 250'
|
||||
for: 24h
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo compaction too many outstanding blocks critical (instance {{ $labels.instance }})
|
||||
description: "There are too many outstanding compaction blocks for {{ $labels.instance }}. Increase compactor resources immediately.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 0.05/s avoids firing on transient single-event spikes.
|
||||
- alert: TempoDistributorUsageTrackerErrors
|
||||
expr: 'sum by (job, reason) (rate(tempo_distributor_usage_tracker_errors_total[5m])) > 0.05'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo distributor usage tracker errors (instance {{ $labels.instance }})
|
||||
description: "Tempo distributor usage tracker errors for {{ $labels.job }} at {{ $value | humanize }}/s (reason {{ $labels.reason }}).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoMetricsGeneratorProcessorUpdatesFailing
|
||||
expr: 'sum by (job) (increase(tempo_metrics_generator_active_processors_update_failed_total[5m])) > 2'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo metrics generator processor updates failing (instance {{ $labels.instance }})
|
||||
description: "Tempo metrics generator processor updates are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoMetricsGeneratorServiceGraphsDroppingSpans
|
||||
expr: '100 * sum by (job) (rate(tempo_metrics_generator_processor_service_graphs_dropped_spans_total[5m])) / sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0.5 and sum by (job) (rate(tempo_metrics_generator_spans_received_total[5m])) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Tempo metrics generator service graphs dropping spans (instance {{ $labels.instance }})
|
||||
description: "Tempo metrics generator is dropping {{ printf \"%.2f\" $value }}% of spans in service graphs for {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: TempoMetricsGeneratorCollectionsFailing
|
||||
expr: 'sum by (job) (increase(tempo_metrics_generator_registry_collections_failed_total[5m])) > 2'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Tempo metrics generator collections failing (instance {{ $labels.instance }})
|
||||
description: "Tempo metrics generator collections are failing for {{ $labels.job }} ({{ $value }} failures in 5m).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when the memcached error rate exceeds 20%. Only relevant if Tempo is configured with memcached caching.
|
||||
- alert: TempoMemcachedErrorsElevated
|
||||
expr: '100 * sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count{status_code="500"}[5m])) / sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 20 and sum by (name, job) (rate(tempo_memcache_request_duration_seconds_count[5m])) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Tempo memcached errors elevated (instance {{ $labels.instance }})
|
||||
description: "Tempo memcached error rate is {{ printf \"%.2f\" $value }}% for {{ $labels.name }} in {{ $labels.job }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
13
dist/rules/graph-node/embedded-exporter.yml
vendored
13
dist/rules/graph-node/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ProviderFailedBecauseNet_versionFailed
|
||||
|
|
@ -40,20 +41,22 @@ groups:
|
|||
summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
|
||||
description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StoreConnectionIsTooSlow
|
||||
# Threshold of 10ms. Adjust based on your expected database latency.
|
||||
- alert: StoreConnectionSlow
|
||||
expr: 'store_connection_wait_time_ms > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||
summary: Store connection slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: StoreConnectionIsTooSlow
|
||||
# Threshold of 20ms. Adjust based on your expected database latency.
|
||||
- alert: StoreConnectionVerySlow
|
||||
expr: 'store_connection_wait_time_ms > 20'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Store connection very slow (instance {{ $labels.instance }})
|
||||
description: "Store connection is very slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
19
dist/rules/hadoop/jmx_exporter.yml
vendored
19
dist/rules/hadoop/jmx_exporter.yml
vendored
|
|
@ -2,8 +2,12 @@ groups:
|
|||
|
||||
- name: Jmx_exporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
|
||||
# so this alert may not fire. Prefer application-level availability metrics if available.
|
||||
# Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
|
||||
- alert: HadoopNameNodeDown
|
||||
expr: 'up{job="hadoop-namenode"} == 0'
|
||||
for: 5m
|
||||
|
|
@ -13,6 +17,9 @@ groups:
|
|||
summary: Hadoop Name Node Down (instance {{ $labels.instance }})
|
||||
description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
|
||||
# so this alert may not fire. Prefer application-level availability metrics if available.
|
||||
# Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
|
||||
- alert: HadoopResourceManagerDown
|
||||
expr: 'up{job="hadoop-resourcemanager"} == 0'
|
||||
for: 5m
|
||||
|
|
@ -32,7 +39,7 @@ groups:
|
|||
description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopHdfsDiskSpaceLow
|
||||
expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
|
||||
expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -41,7 +48,7 @@ groups:
|
|||
description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopMapReduceTaskFailures
|
||||
expr: 'hadoop_mapreduce_task_failures_total > 100'
|
||||
expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -50,7 +57,7 @@ groups:
|
|||
description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopResourceManagerMemoryHigh
|
||||
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
|
||||
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -59,7 +66,7 @@ groups:
|
|||
description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopYarnContainerAllocationFailures
|
||||
expr: 'hadoop_yarn_container_allocation_failures_total > 10'
|
||||
expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -77,10 +84,10 @@ groups:
|
|||
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HadoopHbaseRegionServerHeapLow
|
||||
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
|
||||
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
|
||||
description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
34
dist/rules/haproxy/embedded-exporter-v2.yml
vendored
34
dist/rules/haproxy/embedded-exporter-v2.yml
vendored
|
|
@ -2,28 +2,29 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterV2
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HaproxyHighHttp4xxErrorRateBackend
|
||||
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
|
||||
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp5xxErrorRateBackend
|
||||
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
|
||||
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (proxy) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp4xxErrorRateServer
|
||||
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
|
||||
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -32,7 +33,7 @@ groups:
|
|||
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp5xxErrorRateServer
|
||||
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
|
||||
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -41,7 +42,7 @@ groups:
|
|||
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerResponseErrors
|
||||
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
|
||||
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -56,7 +57,7 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
|
||||
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many connection errors to {{ $labels.proxy }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerConnectionErrors
|
||||
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
|
||||
|
|
@ -65,19 +66,20 @@ groups:
|
|||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy server connection errors (instance {{ $labels.instance }})
|
||||
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many connection errors to {{ $labels.proxy }} (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyBackendMaxActiveSession>80%
|
||||
expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80'
|
||||
expr: '(haproxy_backend_current_sessions / haproxy_backend_limit_sessions * 100) > 80 and haproxy_backend_limit_sessions > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
|
||||
description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Session limit from backend {{ $labels.proxy }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# haproxy_backend_current_queue is a gauge (current queue depth), not a counter.
|
||||
- alert: HaproxyPendingRequests
|
||||
expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0'
|
||||
expr: 'sum by (proxy) (haproxy_backend_current_queue) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -92,7 +94,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
|
||||
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "HAProxy backend max total time is above 1s on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyRetryHigh
|
||||
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
|
||||
|
|
@ -122,10 +124,10 @@ groups:
|
|||
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerHealthcheckFailure
|
||||
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
|
||||
for: 1m
|
||||
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
|
||||
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
79
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
79
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: HaproxyExporterV1
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HaproxyDown
|
||||
|
|
@ -13,104 +14,104 @@ groups:
|
|||
summary: HAProxy down (instance {{ $labels.instance }})
|
||||
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp4xxErrorRateBackend
|
||||
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
||||
- alert: HaproxyHighHttp4xxErrorRateBackend(v1)
|
||||
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy high HTTP 4xx error rate backend (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp5xxErrorRateBackend
|
||||
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
||||
- alert: HaproxyHighHttp5xxErrorRateBackend(v1)
|
||||
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 and sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy high HTTP 5xx error rate backend (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp4xxErrorRateServer
|
||||
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
||||
- alert: HaproxyHighHttp4xxErrorRateServer(v1)
|
||||
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})
|
||||
summary: HAProxy high HTTP 4xx error rate server (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHighHttp5xxErrorRateServer
|
||||
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
||||
- alert: HaproxyHighHttp5xxErrorRateServer(v1)
|
||||
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})
|
||||
summary: HAProxy high HTTP 5xx error rate server (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerResponseErrors
|
||||
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
|
||||
- alert: HaproxyServerResponseErrors(v1)
|
||||
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5 and sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy server response errors (instance {{ $labels.instance }})
|
||||
summary: HAProxy server response errors (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyBackendConnectionErrors
|
||||
- alert: HaproxyBackendConnectionErrors(v1)
|
||||
expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy backend connection errors (instance {{ $labels.instance }})
|
||||
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy backend connection errors (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many connection errors to {{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerConnectionErrors
|
||||
- alert: HaproxyServerConnectionErrors(v1)
|
||||
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: HAProxy server connection errors (instance {{ $labels.instance }})
|
||||
summary: HAProxy server connection errors (v1) (instance {{ $labels.instance }})
|
||||
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyBackendMaxActiveSession
|
||||
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
|
||||
expr: '((sum by (backend) (haproxy_backend_current_sessions * 100) / sum by (backend) (haproxy_backend_limit_sessions))) > 80 and sum by (backend) (haproxy_backend_limit_sessions) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy backend max active session (instance {{ $labels.instance }})
|
||||
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "HAProxy backend {{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyPendingRequests
|
||||
- alert: HaproxyPendingRequests(v1)
|
||||
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy pending requests (instance {{ $labels.instance }})
|
||||
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy pending requests (v1) (instance {{ $labels.instance }})
|
||||
description: "Some HAProxy requests are pending on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyHttpSlowingDown
|
||||
- alert: HaproxyHttpSlowingDown(v1)
|
||||
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
|
||||
summary: HAProxy HTTP slowing down (v1) (instance {{ $labels.instance }})
|
||||
description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyRetryHigh
|
||||
- alert: HaproxyRetryHigh(v1)
|
||||
expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy retry high (instance {{ $labels.instance }})
|
||||
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy retry high (v1) (instance {{ $labels.instance }})
|
||||
description: "High rate of retry on {{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyBackendDown
|
||||
expr: 'haproxy_backend_up == 0'
|
||||
|
|
@ -130,20 +131,20 @@ groups:
|
|||
summary: HAProxy server down (instance {{ $labels.instance }})
|
||||
description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyFrontendSecurityBlockedRequests
|
||||
- alert: HaproxyFrontendSecurityBlockedRequests(v1)
|
||||
expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy frontend security blocked requests (instance {{ $labels.instance }})
|
||||
summary: HAProxy frontend security blocked requests (v1) (instance {{ $labels.instance }})
|
||||
description: "HAProxy is blocking requests for security reason\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HaproxyServerHealthcheckFailure
|
||||
expr: 'increase(haproxy_server_check_failures_total[1m]) > 0'
|
||||
for: 1m
|
||||
- alert: HaproxyServerHealthcheckFailure(v1)
|
||||
expr: 'increase(haproxy_server_check_failures_total[1m]) > 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
|
||||
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: HAProxy server healthcheck failure (v1) (instance {{ $labels.instance }})
|
||||
description: "Some server healthcheck are failing on {{ $labels.server }} ({{ $value }} in the last 1m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
11
dist/rules/hashicorp-vault/embedded-exporter.yml
vendored
11
dist/rules/hashicorp-vault/embedded-exporter.yml
vendored
|
|
@ -2,11 +2,12 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: VaultSealed
|
||||
expr: 'vault_core_unsealed == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -20,7 +21,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Vault too many pending tokens (instance {{ $labels.instance }})
|
||||
description: "Too many pending tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many pending tokens on {{ $labels.instance }}: {{ $value }} tokens created but not yet stored.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: VaultTooManyInfinityTokens
|
||||
expr: 'vault_token_count_by_ttl{creation_ttl="+Inf"} > 3'
|
||||
|
|
@ -29,13 +30,13 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Vault too many infinity tokens (instance {{ $labels.instance }})
|
||||
description: "Too many infinity tokens {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Too many non-expiring tokens on {{ $labels.instance }}: {{ $value }} tokens with infinite TTL.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: VaultClusterHealth
|
||||
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
|
||||
expr: 'sum(vault_core_active) / count(vault_core_active) <= 0.5 and count(vault_core_active) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Vault cluster health (instance {{ $labels.instance }})
|
||||
description: "Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Vault cluster is not healthy: only {{ $value | humanizePercentage }} of nodes are active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
190
dist/rules/host-and-hardware/node-exporter.yml
vendored
190
dist/rules/host-and-hardware/node-exporter.yml
vendored
|
|
@ -2,10 +2,11 @@ groups:
|
|||
|
||||
- name: NodeExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -13,107 +14,106 @@ groups:
|
|||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# node_vmstat_pgmajfault is exposed as untyped/gauge by node_exporter (from /proc/vmstat), so deriv() is used instead of rate().
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
expr: '(deriv(node_vmstat_pgmajfault[5m]) > 1000)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults ({{ $value }}/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly.
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1w
|
||||
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
expr: '((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
expr: '((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > .80) and node_network_speed_bytes > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 5m
|
||||
- alert: HostDiskIoUtilizationHigh
|
||||
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host disk IO utilization high (instance {{ $labels.instance }})
|
||||
description: "Disk utilization is high (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostDiskMayFillIn24Hours
|
||||
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) and node_filesystem_files > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostFilesystemDeviceError
|
||||
expr: 'node_filesystem_device_error == 1'
|
||||
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
- alert: HostInodesMayFillIn24Hours
|
||||
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -122,7 +122,7 @@ groups:
|
|||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -131,7 +131,7 @@ groups:
|
|||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > .80'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -139,17 +139,18 @@ groups:
|
|||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# You may want to increase the Alertmanager 'repeat_interval' for this type of alert to daily or weekly.
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(min without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -158,34 +159,37 @@ groups:
|
|||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuHighIowait
|
||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: 'avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskIo
|
||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostContextSwitching
|
||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# The 2x threshold (recent context-switch rate vs. the daily average) is an arbitrary number.
|
||||
# The alert threshold depends on the nature of the application.
|
||||
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
||||
- alert: HostContextSwitchingHigh
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 and rate(node_context_switches_total[1d]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (> 10000 / CPU / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host context switching high (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) and node_memory_SwapTotal_bytes > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -194,16 +198,16 @@ groups:
|
|||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_systemd_unit_state{state="failed"} == 1)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "systemd service {{ $labels.name }} crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -212,7 +216,7 @@ groups:
|
|||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -220,35 +224,37 @@ groups:
|
|||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
# node_md_disks carries a `state` label that node_md_disks_required does not; ignoring(state) lets the two series match.
|
||||
- alert: HostSoftwareRaidInsufficientDrives
|
||||
expr: '((node_md_disks_required - ignoring(state) node_md_disks{state="active"}) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
- alert: HostSoftwareRaidDiskFailure
|
||||
expr: '(node_md_disks{state="failed"} > 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Host software RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostKernelVersionDeviations
|
||||
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 6h
|
||||
expr: 'changes(node_uname_info[1h]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# When a machine runs out of memory, the node exporter can become unresponsive for several minutes. Even if the system takes 15–20 minutes to recover, the alert should still trigger.
|
||||
- alert: HostOomKillDetected
|
||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(delta(node_vmstat_oom_kill[30m]) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -257,25 +263,25 @@ groups:
|
|||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 1 minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_edac_uncorrectable_errors_total > 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) and rate(node_network_receive_packets_total[2m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -284,7 +290,7 @@ groups:
|
|||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) and rate(node_network_transmit_packets_total[2m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -292,17 +298,8 @@ groups:
|
|||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -311,7 +308,7 @@ groups:
|
|||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) and node_nf_conntrack_entries_limit > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -320,7 +317,7 @@ groups:
|
|||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -329,19 +326,10 @@ groups:
|
|||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
165
dist/rules/ipmi/ipmi-exporter.yml
vendored
Normal file
165
dist/rules/ipmi/ipmi-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
groups:
|
||||
|
||||
- name: IpmiExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# The ipmi_up metric is per-collector. A value of 0 means the collector could not retrieve data from the BMC.
|
||||
- alert: IpmiCollectorDown
|
||||
expr: 'ipmi_up == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI collector down (instance {{ $labels.instance }})
|
||||
description: "IPMI collector {{ $labels.collector }} on {{ $labels.instance }} failed to scrape sensor data. Check FreeIPMI tools and BMC connectivity.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# State values: 0=nominal, 1=warning, 2=critical. Thresholds are defined in the BMC firmware.
|
||||
- alert: IpmiTemperatureSensorWarning
|
||||
expr: 'ipmi_temperature_state == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI temperature sensor warning (instance {{ $labels.instance }})
|
||||
description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiTemperatureSensorCritical
|
||||
expr: 'ipmi_temperature_state == 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI temperature sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI temperature sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Immediate attention required to prevent hardware damage.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiFanSpeedSensorWarning
|
||||
expr: 'ipmi_fan_speed_state == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI fan speed sensor warning (instance {{ $labels.instance }})
|
||||
description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiFanSpeedSensorCritical
|
||||
expr: 'ipmi_fan_speed_state == 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI fan speed sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI fan sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. A fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiFanSpeedZero
|
||||
expr: 'ipmi_fan_speed_rpm == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI fan speed zero (instance {{ $labels.instance }})
|
||||
description: "IPMI fan {{ $labels.name }} on {{ $labels.instance }} reports 0 RPM. The fan may have failed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiVoltageSensorWarning
|
||||
expr: 'ipmi_voltage_state == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI voltage sensor warning (instance {{ $labels.instance }})
|
||||
description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiVoltageSensorCritical
|
||||
expr: 'ipmi_voltage_state == 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI voltage sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI voltage sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state. Power supply or motherboard issue possible.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiCurrentSensorWarning
|
||||
expr: 'ipmi_current_state == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI current sensor warning (instance {{ $labels.instance }})
|
||||
description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiCurrentSensorCritical
|
||||
expr: 'ipmi_current_state == 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI current sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI current sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiPowerSensorWarning
|
||||
expr: 'ipmi_power_state == 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI power sensor warning (instance {{ $labels.instance }})
|
||||
description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in warning state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiPowerSensorCritical
|
||||
expr: 'ipmi_power_state == 2'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI power sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI power sensor {{ $labels.name }} on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Catches any sensor type not covered by the specific temperature/fan/voltage/current/power alerts.
|
||||
- alert: IpmiGenericSensorCritical
|
||||
expr: 'ipmi_sensor_state == 2'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI generic sensor critical (instance {{ $labels.instance }})
|
||||
description: "IPMI sensor {{ $labels.name }} (type={{ $labels.type }}) on {{ $labels.instance }} is in critical state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IpmiChassisPowerOff
|
||||
expr: 'ipmi_chassis_power_state == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI chassis power off (instance {{ $labels.instance }})
|
||||
description: "IPMI reports chassis power is off on {{ $labels.instance }}. The server may have shut down unexpectedly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The metric uses inverted logic: 1=no fault, 0=fault detected.
|
||||
- alert: IpmiChassisDriveFault
|
||||
expr: 'ipmi_chassis_drive_fault_state == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI chassis drive fault (instance {{ $labels.instance }})
|
||||
description: "IPMI reports a drive fault on {{ $labels.instance }}. Check disk health.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The metric uses inverted logic: 1=no fault, 0=fault detected.
|
||||
- alert: IpmiChassisCoolingFault
|
||||
expr: 'ipmi_chassis_cooling_fault_state == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: IPMI chassis cooling fault (instance {{ $labels.instance }})
|
||||
description: "IPMI reports a cooling/fan fault on {{ $labels.instance }}. Check fans and airflow.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# SEL storage is typically very limited (e.g., 16KB). When full, new events may be dropped.
|
||||
- alert: IpmiSelAlmostFull
|
||||
expr: 'ipmi_sel_free_space_bytes < 512'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: IPMI SEL almost full (instance {{ $labels.instance }})
|
||||
description: "IPMI System Event Log on {{ $labels.instance }} has only {{ printf \"%.0f\" $value }} bytes free. Clear the SEL to prevent loss of new events.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
36
dist/rules/istio/embedded-exporter.yml
vendored
36
dist/rules/istio/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: IstioKubernetesGatewayAvailabilityDrop
|
||||
|
|
@ -11,17 +12,18 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }})
|
||||
description: "Gateway pods have dropped. Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Istio ingress gateway has only {{ $value }} available pod(s). Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotHighTotalRequestRate
|
||||
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
|
||||
- alert: IstioPilotHighPushErrorRate
|
||||
expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5 and sum(rate(pilot_xds_pushes[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio Pilot high total request rate (instance {{ $labels.instance }})
|
||||
summary: Istio Pilot high push error rate (instance {{ $labels.instance }})
|
||||
description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Mixer was deprecated in Istio 1.5 and removed in Istio 1.8. This alert only applies to Istio < 1.8.
|
||||
- alert: IstioMixerPrometheusDispatchesLow
|
||||
expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180'
|
||||
for: 1m
|
||||
|
|
@ -31,6 +33,7 @@ groups:
|
|||
summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }})
|
||||
description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be being exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 1000 req/s is a rough default. Adjust to your expected peak traffic.
|
||||
- alert: IstioHighTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000'
|
||||
for: 2m
|
||||
|
|
@ -38,8 +41,9 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high total request rate (instance {{ $labels.instance }})
|
||||
description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Global request rate in the service mesh is unusually high ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 100 req/s is a rough default. Adjust to your expected baseline traffic. This alert may fire on startup or in low-traffic environments.
|
||||
- alert: IstioLowTotalRequestRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100'
|
||||
for: 2m
|
||||
|
|
@ -47,49 +51,49 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Istio low total request rate (instance {{ $labels.instance }})
|
||||
description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Global request rate in the service mesh is unusually low ({{ $value | printf \"%.2f\" }} req/s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh4xxErrorRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 4xx error rate (instance {{ $labels.instance }})
|
||||
description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "High percentage of HTTP 4xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHigh5xxErrorRate
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5'
|
||||
expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5 and sum(rate(istio_requests_total{reporter="destination"}[5m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high 5xx error rate (instance {{ $labels.instance }})
|
||||
description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "High percentage of HTTP 5xx responses in Istio ({{ $value | printf \"%.1f\" }}% > 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioHighRequestLatency
|
||||
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100'
|
||||
expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100 and rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio high request latency (instance {{ $labels.instance }})
|
||||
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Istio average request duration is {{ $value }}ms (> 100ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioLatency99Percentile
|
||||
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, le)) > 1000'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
||||
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Istio p99 request latency is {{ $value }}ms (threshold: 1000ms).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: IstioPilotDuplicateEntry
|
||||
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
|
||||
expr: 'sum(pilot_duplicate_envoy_clusters{}) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }})
|
||||
description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Istio Pilot has detected {{ $value }} duplicate Envoy cluster(s), indicating misconfigured DestinationRules or ServiceEntries.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
82
dist/rules/jaeger/embedded-exporter-legacy.yml
vendored
Normal file
82
dist/rules/jaeger/embedded-exporter-legacy.yml
vendored
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporterLegacy
|
||||
|
||||
# These rules target Jaeger v1.x metrics (jaeger_* prefix).
|
||||
# Jaeger v1 reached end-of-life on December 31, 2025.
|
||||
# For Jaeger v2+, use the "Embedded exporter (v2+)" rules instead.
|
||||
# Note: jaeger-agent was deprecated in v1.35 and removed in v2.0.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JaegerAgentHttpServerErrors
|
||||
expr: '100 * sum(rate(jaeger_agent_http_server_errors_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_http_server_total[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger agent HTTP server errors (instance {{ $labels.instance }})
|
||||
description: "Jaeger agent on {{ $labels.instance }} is experiencing {{ $value | humanize }}% HTTP server errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerClientRpcRequestErrors
|
||||
expr: '100 * sum(rate(jaeger_client_jaeger_rpc_http_requests{status_code=~"4xx|5xx"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_client_jaeger_rpc_http_requests[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger client RPC request errors (instance {{ $labels.instance }})
|
||||
description: "Jaeger client on {{ $labels.instance }} is experiencing {{ $value | humanize }}% RPC HTTP errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerClientSpansDropped
|
||||
expr: '100 * sum(rate(jaeger_reporter_spans{result=~"dropped|err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_reporter_spans[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger client spans dropped (instance {{ $labels.instance }})
|
||||
description: "Jaeger client on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerAgentSpansDropped
|
||||
expr: '100 * sum(rate(jaeger_agent_reporter_batches_failures_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_agent_reporter_batches_submitted_total[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger agent spans dropped (instance {{ $labels.instance }})
|
||||
description: "Jaeger agent on {{ $labels.instance }} is dropping {{ $value | humanize }}% of span batches.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerCollectorDroppingSpans
|
||||
expr: '100 * sum(rate(jaeger_collector_spans_dropped_total[1m])) by (instance, job, namespace) / sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_collector_spans_received_total[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger collector dropping spans (instance {{ $labels.instance }})
|
||||
description: "Jaeger collector on {{ $labels.instance }} is dropping {{ $value | humanize }}% of spans.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerSamplingUpdateFailing
|
||||
expr: '100 * sum(rate(jaeger_sampler_queries{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_sampler_queries[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger sampling update failing (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of sampling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerThrottlingUpdateFailing
|
||||
expr: '100 * sum(rate(jaeger_throttler_updates{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_throttler_updates[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger throttling update failing (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} is failing {{ $value | humanize }}% of throttling policy updates.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JaegerQueryRequestFailures
|
||||
expr: '100 * sum(rate(jaeger_query_requests_total{result="err"}[1m])) by (instance, job, namespace) / sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 1 and sum(rate(jaeger_query_requests_total[1m])) by (instance, job, namespace) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger query request failures (instance {{ $labels.instance }})
|
||||
description: "Jaeger query on {{ $labels.instance }} is failing {{ $value | humanize }}% of requests.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
94
dist/rules/jaeger/embedded-exporter.yml
vendored
Normal file
94
dist/rules/jaeger/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,94 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
# Jaeger v2 is built on OpenTelemetry Collector and exposes metrics on port 8888 (/metrics).
|
||||
# It emits standard otelcol_* pipeline metrics alongside Jaeger-specific storage and query metrics.
|
||||
# For span ingestion pipeline alerts (refused spans, export failures, queue saturation),
|
||||
# use the OpenTelemetry Collector rules instead.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JaegerHighStorageErrorRate
|
||||
expr: '100 * sum(rate(jaeger_storage_requests_total{result="err"}[1m])) by (instance, job, namespace, operation) / sum(rate(jaeger_storage_requests_total[1m])) by (instance, job, namespace, operation) > 1 and sum(rate(jaeger_storage_requests_total[1m])) by (instance, job, namespace, operation) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger high storage error rate (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} is experiencing {{ $value | humanize }}% storage errors on {{ $labels.operation }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 1s is a rough default. Adjust based on your storage backend and data volume.
|
||||
- alert: JaegerSlowStorageOperations
|
||||
expr: 'histogram_quantile(0.99, sum(rate(jaeger_storage_latency_seconds_bucket[5m])) by (le, instance, job, namespace, operation)) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger slow storage operations (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} storage p99 latency for {{ $labels.operation }} is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Filters on http_route="/api/traces" (the trace search endpoint). The http_server_request_duration_seconds
|
||||
# metric is emitted by the otelhttp middleware used by the Jaeger query service.
|
||||
- alert: JaegerQueryServiceHighErrorRate
|
||||
expr: '100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces",http_response_status_code=~"5.."}[1m])) by (instance, job, namespace) / sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) by (instance, job, namespace) > 1 and sum(rate(http_server_request_duration_seconds_count{http_route="/api/traces"}[1m])) by (instance, job, namespace) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger query service high error rate (instance {{ $labels.instance }})
|
||||
description: "Jaeger query service on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 2s is a rough default. Adjust based on your storage backend and data volume.
|
||||
- alert: JaegerQueryServiceSlowResponses
|
||||
expr: 'histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces"}[5m])) by (le, instance, job, namespace)) > 2'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger query service slow responses (instance {{ $labels.instance }})
|
||||
description: "Jaeger query service on {{ $labels.instance }} p99 response latency is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when all storage operations for a given type are failing and none are succeeding.
|
||||
# Indicates the storage backend (Cassandra, Elasticsearch, etc.) is likely unreachable or misconfigured.
|
||||
- alert: JaegerStorageCompletelyUnavailable
|
||||
expr: 'sum(rate(jaeger_storage_requests_total{result="err"}[1m])) by (instance, job, namespace, operation) > 0 and sum(rate(jaeger_storage_requests_total{result="ok"}[1m])) by (instance, job, namespace, operation) == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Jaeger storage completely unavailable (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} has 100% storage errors for {{ $labels.operation }} — storage backend may be down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Single trace retrieval (/api/traces/{traceID}) can be slower than search, especially for large traces.
|
||||
# Threshold of 5s is a rough default.
|
||||
- alert: JaegerSlowSingleTraceRetrieval
|
||||
expr: 'histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{http_route="/api/traces/{traceID}"}[5m])) by (le, instance, job, namespace)) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger slow single trace retrieval (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} p99 latency for single trace retrieval is {{ $value | humanizeDuration }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Errors on /api/services indicate the storage backend cannot return the list of instrumented services,
|
||||
# which breaks the Jaeger UI service selector.
|
||||
- alert: JaegerServiceDiscoveryErrors
|
||||
expr: '100 * sum(rate(http_server_request_duration_seconds_count{http_route="/api/services",http_response_status_code=~"5.."}[1m])) by (instance, job, namespace) / sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) by (instance, job, namespace) > 1 and sum(rate(http_server_request_duration_seconds_count{http_route="/api/services"}[1m])) by (instance, job, namespace) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger service discovery errors (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} is returning {{ $value | humanize }}% HTTP 5xx errors on the services endpoint.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when an operation (e.g. find_traces, get_services) has received requests but none succeeded.
|
||||
# May indicate a persistent storage error or a backend that is slow to recover.
|
||||
- alert: JaegerNoStorageReadsSucceeding
|
||||
expr: 'sum(increase(jaeger_storage_requests_total{result="ok"}[15m])) by (instance, job, namespace, operation) == 0 and sum(increase(jaeger_storage_requests_total[15m])) by (instance, job, namespace, operation) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Jaeger no storage reads succeeding (instance {{ $labels.instance }})
|
||||
description: "Jaeger on {{ $labels.instance }} has no successful storage reads for {{ $labels.operation }} in the past 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
26
dist/rules/jenkins/metric-plugin.yml
vendored
26
dist/rules/jenkins/metric-plugin.yml
vendored
|
|
@ -2,16 +2,26 @@ groups:
|
|||
|
||||
- name: MetricPlugin
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JenkinsOffline
|
||||
expr: 'jenkins_node_offline_value > 1'
|
||||
- alert: JenkinsNodeOffline
|
||||
expr: 'jenkins_node_offline_value > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Jenkins node offline (instance {{ $labels.instance }})
|
||||
description: "At least one Jenkins node offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JenkinsNoNodeOnline
|
||||
expr: 'jenkins_node_online_value == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Jenkins offline (instance {{ $labels.instance }})
|
||||
description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Jenkins no node online (instance {{ $labels.instance }})
|
||||
description: "No Jenkins nodes are online: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JenkinsHealthcheck
|
||||
expr: 'jenkins_health_check_score < 1'
|
||||
|
|
@ -41,7 +51,7 @@ groups:
|
|||
description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JenkinsRunFailureTotal
|
||||
expr: 'delta(jenkins_runs_failure_total[1h]) > 100'
|
||||
expr: 'increase(jenkins_runs_failure_total[1h]) > 100'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -58,6 +68,12 @@ groups:
|
|||
summary: Jenkins build tests failing (instance {{ $labels.instance }})
|
||||
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# * RUNNING -1 true - The build is still in progress, so no final result is available yet.
|
||||
# * SUCCESS 0 true - The build had no errors.
|
||||
# * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
|
||||
# * FAILURE 2 false - The build had a fatal error.
|
||||
# * NOT_BUILT 3 false - The module was not built.
|
||||
# * ABORTED 4 false - The build was manually aborted.
|
||||
- alert: JenkinsLastBuildFailed
|
||||
expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
|
||||
for: 0m
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: CzerwonkJunosExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JuniperSwitchDown
|
||||
|
|
@ -13,20 +14,20 @@ groups:
|
|||
summary: Juniper switch down (instance {{ $labels.instance }})
|
||||
description: "The switch appears to be down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JuniperHighBandwidthUsage1gib
|
||||
- alert: JuniperCriticalBandwidthUsage1gib
|
||||
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
|
||||
summary: Juniper critical Bandwidth Usage 1GiB (instance {{ $labels.instance }})
|
||||
description: "Interface is highly saturated. (> 0.90GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JuniperHighBandwidthUsage1gib
|
||||
- alert: JuniperWarningBandwidthUsage1gib
|
||||
expr: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Juniper high Bandwidth Usage 1GiB (instance {{ $labels.instance }})
|
||||
summary: Juniper warning Bandwidth Usage 1GiB (instance {{ $labels.instance }})
|
||||
description: "Interface is getting saturated. (> 0.80GiB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
108
dist/rules/jvm/jvm-exporter.yml
vendored
108
dist/rules/jvm/jvm-exporter.yml
vendored
|
|
@ -2,13 +2,119 @@ groups:
|
|||
|
||||
- name: JvmExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JvmMemoryFillingUp
|
||||
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80'
|
||||
expr: '(sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 and sum by (instance)(jvm_memory_max_bytes{area="heap"}) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM memory filling up (instance {{ $labels.instance }})
|
||||
description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Many JVM configurations leave metaspace unbounded, in which case jvm_memory_max_bytes{area="nonheap"} is -1 and this alert will not fire.
|
||||
# The query filters out max_bytes <= 0 so that the usage ratio is never computed against an undefined (-1) limit.
|
||||
- alert: JvmNon-heapMemoryFillingUp
|
||||
expr: '(sum by (instance)(jvm_memory_used_bytes{area="nonheap"}) / (sum by (instance)(jvm_memory_max_bytes{area="nonheap"}) > 0)) * 100 > 80'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM non-heap memory filling up (instance {{ $labels.instance }})
|
||||
description: "JVM non-heap memory (metaspace/code cache) is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmGcTimeTooHigh
|
||||
expr: 'sum by (instance)(rate(jvm_gc_collection_seconds_sum[5m])) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM GC time too high (instance {{ $labels.instance }})
|
||||
description: "JVM is spending too much time in garbage collection (> 5% of wall clock time)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmThreadsDeadlocked
|
||||
expr: 'jvm_threads_deadlocked > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: JVM threads deadlocked (instance {{ $labels.instance }})
|
||||
description: "JVM has deadlocked threads\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmThreadCountHigh
|
||||
expr: 'jvm_threads_current > 300'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM thread count high (instance {{ $labels.instance }})
|
||||
description: "JVM thread count is high (> 300), potential thread leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmThreadsBlocked
|
||||
expr: 'jvm_threads_state{state="BLOCKED"} > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM threads BLOCKED (instance {{ $labels.instance }})
|
||||
description: "JVM has high number of BLOCKED threads, indicating lock contention\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
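# This regex is meant to match the old-generation/major collectors of CMS, G1, and Parallel GC. It will not match ZGC or Shenandoah cycle names. PromQL regex matching is case-sensitive, so check it against the gc label values your exporter actually emits (e.g. "G1 Old Generation").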
|
||||
# Adjust the gc label filter if you use a different collector.
|
||||
- alert: JvmOldGenGcFrequency
|
||||
expr: 'rate(jvm_gc_collection_seconds_count{gc=~".*old.*|.*major.*"}[5m]) > 0.3'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM old gen GC frequency (instance {{ $labels.instance }})
|
||||
description: "Frequent old/major GC cycles, indicating memory pressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmDirectBufferPoolFillingUp
|
||||
expr: '(jvm_buffer_pool_used_bytes / jvm_buffer_pool_capacity_bytes) * 100 > 90 and jvm_buffer_pool_capacity_bytes > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM direct buffer pool filling up (instance {{ $labels.instance }})
|
||||
description: "JVM direct buffer pool is filling up (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmObjectsPendingFinalization
|
||||
expr: 'jvm_memory_objects_pending_finalization > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM objects pending finalization (instance {{ $labels.instance }})
|
||||
description: "JVM has objects pending finalization, potential memory leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# process_open_fds and process_max_fds are generic metrics from the Prometheus client library, not JVM-specific.
|
||||
# This alert will also fire for Go, Python, or any process exposing these metrics.
|
||||
- alert: JvmFileDescriptorsExhaustion
|
||||
expr: '(process_open_fds / process_max_fds) * 100 > 90 and process_max_fds > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM file descriptors exhaustion (instance {{ $labels.instance }})
|
||||
description: "JVM process is running out of file descriptors (> 90% used)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmClassLoadingAnomaly
|
||||
expr: 'rate(jvm_classes_loaded_total[5m]) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM class loading anomaly (instance {{ $labels.instance }})
|
||||
description: "Rapid class loading detected, potential classloader leak\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: JvmCompilationTimeSpike
|
||||
expr: 'rate(jvm_compilation_time_seconds_total[5m]) > 0.1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JVM compilation time spike (instance {{ $labels.instance }})
|
||||
description: "Excessive JIT compilation time consuming CPU\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
15
dist/rules/kafka/danielqsj-kafka-exporter.yml
vendored
15
dist/rules/kafka/danielqsj-kafka-exporter.yml
vendored
|
|
@ -2,22 +2,23 @@ groups:
|
|||
|
||||
- name: DanielqsjKafkaExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KafkaTopicsReplicas
|
||||
expr: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
|
||||
expr: 'min(kafka_topic_partition_in_sync_replica) by (topic) < 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kafka topics replicas (instance {{ $labels.instance }})
|
||||
description: "Kafka topic in-sync partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Kafka topic {{ $labels.topic }} has fewer than 3 in-sync replicas ({{ $value }}), data durability is at risk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KafkaConsumersGroup
|
||||
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
|
||||
- alert: KafkaConsumerGroupLag
|
||||
expr: 'sum(kafka_consumergroup_lag) by (consumergroup) > 10000'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kafka consumers group (instance {{ $labels.instance }})
|
||||
description: "Kafka consumers group\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: Kafka consumer group lag (instance {{ $labels.instance }})
|
||||
description: "Kafka consumer group {{ $labels.consumergroup }} is lagging behind ({{ $value }} messages)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
1
dist/rules/kafka/linkedin-kafka-exporter.yml
vendored
1
dist/rules/kafka/linkedin-kafka-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: LinkedinKafkaExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KafkaTopicOffsetDecreased
|
||||
|
|
|
|||
67
dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
vendored
Normal file
67
dist/rules/keycloak/aerogear-keycloak-metrics-spi.yml
vendored
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
groups:
|
||||
|
||||
- name: AerogearKeycloakMetricsSpi
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Threshold of 5% is a rough default. Adjust based on your user base and expected error rates.
|
||||
# A spike in failed logins may indicate a brute-force attack or misconfigured client.
|
||||
- alert: KeycloakHighLoginFailureRate
|
||||
expr: '(sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])) / (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m])))) * 100 > 5 and (sum by (realm) (rate(keycloak_logins_total[5m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[5m]))) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Keycloak high login failure rate (instance {{ $labels.instance }})
|
||||
description: "More than 5% of login attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only fires when login attempts exist but none succeed — may indicate an authentication outage.
|
||||
- alert: KeycloakNoSuccessfulLogins
|
||||
expr: 'sum by (realm) (rate(keycloak_logins_total[15m])) == 0 and (sum by (realm) (rate(keycloak_logins_total[15m])) + sum by (realm) (rate(keycloak_failed_login_attempts_total[15m]))) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Keycloak no successful logins (instance {{ $labels.instance }})
|
||||
description: "No successful logins in realm {{ $labels.realm }} for the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10% is a rough default. High refresh token errors may indicate expired sessions or token store issues.
|
||||
- alert: KeycloakHighTokenRefreshErrorRate
|
||||
expr: '(sum by (realm) (rate(keycloak_refresh_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_refresh_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_refresh_tokens_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Keycloak high token refresh error rate (instance {{ $labels.instance }})
|
||||
description: "More than 10% of token refresh attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10% is a rough default. Code-to-token failures may indicate misconfigured OAuth clients or replay attacks.
|
||||
- alert: KeycloakHighCode-to-tokenExchangeErrorRate
|
||||
expr: '(sum by (realm) (rate(keycloak_code_to_tokens_errors_total[5m])) / sum by (realm) (rate(keycloak_code_to_tokens_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_code_to_tokens_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Keycloak high code-to-token exchange error rate (instance {{ $labels.instance }})
|
||||
description: "More than 10% of code-to-token exchanges are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold of 10% is a rough default.
|
||||
- alert: KeycloakHighRegistrationFailureRate
|
||||
expr: '(sum by (realm) (rate(keycloak_registrations_errors_total[5m])) / sum by (realm) (rate(keycloak_registrations_total[5m]))) * 100 > 10 and sum by (realm) (rate(keycloak_registrations_total[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Keycloak high registration failure rate (instance {{ $labels.instance }})
|
||||
description: "More than 10% of registration attempts are failing in realm {{ $labels.realm }} (current value: {{ $value | printf \"%.1f\" }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# keycloak_request_duration is in milliseconds. Threshold of 2000ms (2 seconds) is a rough default.
|
||||
- alert: KeycloakSlowRequestResponseTime
|
||||
expr: 'sum by (method) (rate(keycloak_request_duration_sum[5m])) / sum by (method) (rate(keycloak_request_duration_count[5m])) > 2000 and sum by (method) (rate(keycloak_request_duration_count[5m])) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Keycloak slow request response time (instance {{ $labels.instance }})
|
||||
description: "Keycloak {{ $labels.method }} requests are taking more than 2 seconds on average.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
93
dist/rules/kubernetes/kubestate-exporter.yml
vendored
93
dist/rules/kubernetes/kubestate-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: KubestateExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KubernetesNodeNotReady
|
||||
|
|
@ -10,16 +11,27 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Node ready (node {{ $labels.node }})
|
||||
summary: Kubernetes Node not ready (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A Kubernetes node with scheduling disabled (cordoned) is not a problem in itself.
|
||||
# This alert is useful for being warned when a node remains unschedulable for a long time.
|
||||
- alert: KubernetesNodeSchedulingDisabled
|
||||
expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Node scheduling disabled (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} has been marked as unschedulable for more than 30 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeDiskPressure
|
||||
|
|
@ -28,7 +40,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
||||
summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
|
||||
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeNetworkUnavailable
|
||||
|
|
@ -41,7 +53,7 @@ groups:
|
|||
description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesNodeOutOfPodCapacity
|
||||
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -55,7 +67,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
|
||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobFailed
|
||||
|
|
@ -64,16 +76,34 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
summary: Kubernetes Job failed (instance {{ $labels.instance }})
|
||||
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobNotStarting
|
||||
expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Job not starting (instance {{ $labels.instance }})
|
||||
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobFailing
|
||||
expr: '(kube_cronjob_status_last_schedule_time > kube_cronjob_status_last_successful_time) AND (kube_cronjob_status_active == 0) AND (kube_cronjob_spec_suspend == 0)'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes CronJob failing (instance {{ $labels.instance }})
|
||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is failing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesCronjobSuspended
|
||||
expr: 'kube_cronjob_spec_suspend != 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
|
||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeclaimPending
|
||||
|
|
@ -82,11 +112,11 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
|
||||
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
|
||||
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesVolumeOutOfDiskSpace
|
||||
expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
|
||||
expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10 and kubelet_volume_stats_capacity_bytes > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -104,12 +134,12 @@ groups:
|
|||
description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPersistentvolumeError
|
||||
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
||||
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
|
||||
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
|
||||
description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetDown
|
||||
|
|
@ -118,7 +148,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleInability
|
||||
|
|
@ -140,7 +170,7 @@ groups:
|
|||
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesHpaScaleMaximum
|
||||
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
|
||||
expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: info
|
||||
|
|
@ -163,7 +193,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
|
||||
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesPodCrashLooping
|
||||
|
|
@ -172,7 +202,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
|
||||
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesReplicasetReplicasMismatch
|
||||
|
|
@ -181,7 +211,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
||||
summary: Kubernetes ReplicaSet replicas mismatch (instance {{ $labels.instance }})
|
||||
description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDeploymentReplicasMismatch
|
||||
|
|
@ -190,7 +220,7 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
|
||||
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetReplicasMismatch
|
||||
|
|
@ -208,7 +238,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
|
||||
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetGenerationMismatch
|
||||
|
|
@ -217,7 +247,7 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
||||
|
|
@ -226,16 +256,16 @@ groups:
|
|||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
|
||||
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetRolloutStuck
|
||||
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
|
||||
expr: '(kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 and kube_daemonset_status_desired_number_scheduled > 0) or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
|
||||
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesDaemonsetMisscheduled
|
||||
|
|
@ -244,16 +274,17 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
|
||||
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold should be customized for each cronjob name.
|
||||
- alert: KubernetesCronjobTooLong
|
||||
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
|
||||
expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
|
||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesJobSlowCompletion
|
||||
|
|
@ -262,26 +293,26 @@ groups:
|
|||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||
summary: Kubernetes Job slow completion (instance {{ $labels.instance }})
|
||||
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerErrors
|
||||
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
|
||||
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3 and sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API server errors (instance {{ $labels.instance }})
|
||||
description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Kubernetes API server is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiClientErrors
|
||||
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1'
|
||||
expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1 and sum(rate(rest_client_requests_total[1m])) by (instance, job) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Kubernetes API client errors (instance {{ $labels.instance }})
|
||||
description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Kubernetes API client is experiencing {{ $value | humanize }}% error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesClientCertificateExpiresNextWeek
|
||||
expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60'
|
||||
|
|
@ -302,7 +333,7 @@ groups:
|
|||
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: KubernetesApiServerLatency
|
||||
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) > 1'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
6
dist/rules/linkerd/embedded-exporter.yml
vendored
6
dist/rules/linkerd/embedded-exporter.yml
vendored
|
|
@ -2,13 +2,15 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Linkerd does not expose request_errors_total. Errors are tracked via response_total{classification="failure"}.
|
||||
- alert: LinkerdHighErrorRate
|
||||
expr: 'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10'
|
||||
expr: 'sum(rate(response_total{classification="failure"}[1m])) by (deployment, statefulset, daemonset) / sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10 and sum(rate(response_total[1m])) by (deployment, statefulset, daemonset) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Linkerd high error rate (instance {{ $labels.instance }})
|
||||
description: "Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Linkerd error rate for {{ $labels.deployment }}{{ $labels.statefulset }}{{ $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
36
dist/rules/litellm/embedded-exporter.yml
vendored
Normal file
36
dist/rules/litellm/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# The threshold (1) is in USD. The `model` label carries the resolved model-name (post-routing).
|
||||
# PromQL `increase()` needs at least two samples in the range, with the counter growing between them, to return a positive value —
|
||||
# for a brand-new counter series this means at least two distinct request bursts, one or more scrape intervals apart.
|
||||
- alert: LitellmProviderSpendOverBudget
|
||||
expr: 'sum(increase(litellm_spend_metric_total{model=~"(claude-|anthropic/).*"}[24h])) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: LiteLLM provider spend over budget (instance {{ $labels.instance }})
|
||||
description: "Cumulative spend for an LLM provider has exceeded the daily budget threshold. Replace the regex `(claude-|anthropic/).*` with your provider's model-name pattern. Useful as a soft-warning when `provider_budget_config` hard-cap is unavailable or disabled.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LitellmProxyFailedRequestsRateHigh
|
||||
expr: 'sum(rate(litellm_proxy_failed_requests_metric_total[5m])) / sum(rate(litellm_proxy_total_requests_metric_total[5m])) > 0.05'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: LiteLLM proxy failed requests rate high (instance {{ $labels.instance }})
|
||||
description: "LiteLLM proxy is returning failed responses to clients (>5% error rate over 5min). Investigate downstream LLM provider availability or auth issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LitellmRequestLatencyP95High
|
||||
expr: 'histogram_quantile(0.95, sum(rate(litellm_request_total_latency_metric_bucket[5m])) by (le)) > 10'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: LiteLLM request latency p95 high (instance {{ $labels.instance }})
|
||||
description: "LiteLLM request total latency p95 exceeds 10 seconds over 5min. Check downstream LLM provider response-times and proxy queue-depth.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
15
dist/rules/loki/embedded-exporter.yml
vendored
15
dist/rules/loki/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: LokiProcessTooManyRestarts
|
||||
|
|
@ -14,28 +15,28 @@ groups:
|
|||
description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestErrors
|
||||
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
|
||||
expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10 and sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 0'
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request errors (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestPanic
|
||||
expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0'
|
||||
for: 5m
|
||||
expr: 'sum(increase(loki_panic_total[5m])) by (namespace, job) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request panic (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "{{ $labels.job }} is experiencing {{ $value | humanize }} panic(s) in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: LokiRequestLatency
|
||||
expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1'
|
||||
expr: 'histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le)) > 1'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Loki request latency (instance {{ $labels.instance }})
|
||||
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
24
dist/rules/meilisearch/embedded-exporter.yml
vendored
Normal file
24
dist/rules/meilisearch/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MeilisearchIndexIsEmpty
|
||||
expr: 'meilisearch_index_docs_count == 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Meilisearch index is empty (instance {{ $labels.instance }})
|
||||
description: "Meilisearch index {{ $labels.index }} has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MeilisearchHttpResponseTime
|
||||
expr: 'meilisearch_http_response_time_seconds > 0.5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Meilisearch http response time (instance {{ $labels.instance }})
|
||||
description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
91
dist/rules/memcached/memcached-exporter.yml
vendored
Normal file
91
dist/rules/memcached/memcached-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,91 @@
|
|||
groups:
|
||||
|
||||
- name: MemcachedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MemcachedDown
|
||||
expr: 'memcached_up == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Memcached down (instance {{ $labels.instance }})
|
||||
description: "Memcached instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MemcachedConnectionLimitApproaching(>80%)
|
||||
expr: '(memcached_current_connections / memcached_max_connections * 100) > 80 and memcached_max_connections > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached connection limit approaching (> 80%) (instance {{ $labels.instance }})
|
||||
description: "Memcached connection usage is above 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MemcachedConnectionLimitApproaching(>95%)
|
||||
expr: '(memcached_current_connections / memcached_max_connections * 100) > 95 and memcached_max_connections > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Memcached connection limit approaching (> 95%) (instance {{ $labels.instance }})
|
||||
description: "Memcached connection usage is above 95% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MemcachedOutOfMemoryErrors
|
||||
expr: 'sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0.05'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached out of memory errors (instance {{ $labels.instance }})
|
||||
description: "Memcached is returning out-of-memory errors on {{ $labels.instance }} ({{ $value }} errors/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# High memory usage is expected if the cache is well-utilized. This alert fires when it approaches the configured limit, which may cause evictions.
|
||||
- alert: MemcachedMemoryUsageHigh(>90%)
|
||||
expr: '(memcached_current_bytes / memcached_limit_bytes * 100) > 90 and memcached_limit_bytes > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached memory usage high (> 90%) (instance {{ $labels.instance }})
|
||||
description: "Memcached memory usage is above 90% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A sustained eviction rate indicates memory pressure. Consider increasing the Memcached memory limit or reducing cache usage. The threshold of 10 evictions/s is a rough default — adjust it based on your workload.
|
||||
- alert: MemcachedHighEvictionRate
|
||||
expr: 'rate(memcached_items_evicted_total[5m]) > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached high eviction rate (instance {{ $labels.instance }})
|
||||
description: "Memcached is evicting items at a high rate on {{ $labels.instance }} ({{ $value }} evictions/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A low hit rate may indicate poor cache utilization, incorrect cache keys, or TTLs that are too short. Threshold of 80% is a rough default — adjust based on your workload and access patterns.
|
||||
- alert: MemcachedLowCacheHitRate(<80%)
|
||||
expr: '(rate(memcached_commands_total{command="get", status="hit"}[5m]) / (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) * 100) < 80 and (rate(memcached_commands_total{command="get", status="hit"}[5m]) + rate(memcached_commands_total{command="get", status="miss"}[5m])) > 0'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached low cache hit rate (< 80%) (instance {{ $labels.instance }})
|
||||
description: "Memcached cache hit rate is below 80% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MemcachedConnectionsRejected
|
||||
expr: 'increase(memcached_connections_rejected_total[5m]) > 3'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Memcached connections rejected (instance {{ $labels.instance }})
|
||||
description: "Memcached is rejecting connections on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MemcachedItemsTooLarge
|
||||
expr: 'increase(memcached_item_too_large_total[5m]) > 3'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Memcached items too large (instance {{ $labels.instance }})
|
||||
description: "Memcached is rejecting items exceeding max-item-size on {{ $labels.instance }} ({{ $value }} rejections in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
5
dist/rules/minio/embedded-exporter.yml
vendored
5
dist/rules/minio/embedded-exporter.yml
vendored
|
|
@ -2,10 +2,11 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MinioClusterDiskOffline
|
||||
expr: 'minio_cluster_disk_offline_total > 0'
|
||||
expr: 'minio_cluster_drive_offline_total > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -23,7 +24,7 @@ groups:
|
|||
description: "Minio cluster node disk is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MinioDiskSpaceUsage
|
||||
expr: 'disk_storage_available / disk_storage_total * 100 < 10'
|
||||
expr: 'minio_cluster_capacity_raw_free_bytes / minio_cluster_capacity_raw_total_bytes * 100 < 10 and minio_cluster_capacity_raw_total_bytes > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
30
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
30
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
|
|
@ -2,15 +2,16 @@ groups:
|
|||
|
||||
- name: DcuMongodbExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MongodbReplicationLag
|
||||
- alert: MongodbReplicationLag(dcu)
|
||||
expr: 'avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MongoDB replication lag (instance {{ $labels.instance }})
|
||||
summary: MongoDB replication lag (DCU) (instance {{ $labels.instance }})
|
||||
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbReplicationStatus3
|
||||
|
|
@ -58,38 +59,29 @@ groups:
|
|||
summary: MongoDB replication Status 10 (instance {{ $labels.instance }})
|
||||
description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbNumberCursorsOpen
|
||||
- alert: MongodbNumberCursorsOpen(dcu)
|
||||
expr: 'mongodb_metrics_cursor_open{state="total_open"} > 10000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB number cursors open (instance {{ $labels.instance }})
|
||||
summary: MongoDB number cursors open (DCU) (instance {{ $labels.instance }})
|
||||
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbCursorsTimeouts
|
||||
- alert: MongodbCursorsTimeouts(dcu)
|
||||
expr: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
|
||||
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: MongoDB cursors timeouts (DCU) (instance {{ $labels.instance }})
|
||||
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbTooManyConnections
|
||||
expr: 'avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80'
|
||||
- alert: MongodbTooManyConnections(dcu)
|
||||
expr: 'mongodb_connections{state="current"} / (mongodb_connections{state="current"} + mongodb_connections{state="available"}) * 100 > 80 and (mongodb_connections{state="current"} + mongodb_connections{state="available"}) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB too many connections (instance {{ $labels.instance }})
|
||||
summary: MongoDB too many connections (DCU) (instance {{ $labels.instance }})
|
||||
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbVirtualMemoryUsage
|
||||
expr: '(sum(mongodb_memory{type="virtual"}) BY (instance) / sum(mongodb_memory{type="mapped"}) BY (instance)) > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
|
||||
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
37
dist/rules/mongodb/percona-mongodb-exporter.yml
vendored
37
dist/rules/mongodb/percona-mongodb-exporter.yml
vendored
|
|
@ -2,35 +2,39 @@ groups:
|
|||
|
||||
- name: PerconaMongodbExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MongodbDown
|
||||
expr: 'mongodb_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MongoDB Down (instance {{ $labels.instance }})
|
||||
description: "MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MongodbReplicaMemberUnhealthy
|
||||
expr: 'mongodb_rs_members_health == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Mongodb replica member unhealthy (instance {{ $labels.instance }})
|
||||
description: "MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbReplicationLag
|
||||
- alert: MongodbReplicationLag(percona)
|
||||
expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MongoDB replication lag (instance {{ $labels.instance }})
|
||||
summary: MongoDB replication lag (Percona) (instance {{ $labels.instance }})
|
||||
description: "Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This query mixes old (mongodb_mongod_*) and new (mongodb_rs_*) metric names. It requires the Percona exporter to run with --compatible-mode to expose both.
|
||||
- alert: MongodbReplicationHeadroom
|
||||
expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (set) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
|
||||
for: 0m
|
||||
|
|
@ -40,38 +44,29 @@ groups:
|
|||
summary: MongoDB replication headroom (instance {{ $labels.instance }})
|
||||
description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbNumberCursorsOpen
|
||||
- alert: MongodbNumberCursorsOpen(percona)
|
||||
expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB number cursors open (instance {{ $labels.instance }})
|
||||
summary: MongoDB number cursors open (Percona) (instance {{ $labels.instance }})
|
||||
description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbCursorsTimeouts
|
||||
- alert: MongodbCursorsTimeouts(percona)
|
||||
expr: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
|
||||
description: "Too many cursors are timing out\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
summary: MongoDB cursors timeouts (Percona) (instance {{ $labels.instance }})
|
||||
description: "Too many cursors are timing out ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbTooManyConnections
|
||||
expr: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
|
||||
- alert: MongodbTooManyConnections(percona)
|
||||
expr: 'mongodb_ss_connections{conn_type="current"} / (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) * 100 > 80 and (mongodb_ss_connections{conn_type="current"} + mongodb_ss_connections{conn_type="available"}) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB too many connections (instance {{ $labels.instance }})
|
||||
summary: MongoDB too many connections (Percona) (instance {{ $labels.instance }})
|
||||
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MongodbVirtualMemoryUsage
|
||||
expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
|
||||
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: StefanprodanMgobExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MgobBackupFailed
|
||||
|
|
|
|||
63
dist/rules/mysql/mysqld-exporter.yml
vendored
63
dist/rules/mysql/mysqld-exporter.yml
vendored
|
|
@ -2,11 +2,13 @@ groups:
|
|||
|
||||
- name: MysqldExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlDown
|
||||
expr: 'mysql_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -14,7 +16,7 @@ groups:
|
|||
description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlTooManyConnections(>80%)
|
||||
expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80'
|
||||
expr: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 and mysql_global_variables_max_connections > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -23,7 +25,7 @@ groups:
|
|||
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlHighPreparedStatementsUtilization(>80%)
|
||||
expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
|
||||
expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 and mysql_global_variables_max_prepared_stmt_count > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -32,7 +34,7 @@ groups:
|
|||
description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlHighThreadsRunning
|
||||
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
|
||||
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 and mysql_global_variables_max_connections > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -40,18 +42,20 @@ groups:
|
|||
summary: MySQL high threads running (instance {{ $labels.instance }})
|
||||
description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlSlaveIoThreadNotRunning
|
||||
expr: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
|
||||
description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: MysqlSlaveSqlThreadNotRunning
|
||||
expr: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -67,23 +71,25 @@ groups:
|
|||
summary: MySQL Slave replication lag (instance {{ $labels.instance }})
|
||||
description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so delta() is used instead of increase().
|
||||
- alert: MysqlSlowQueries
|
||||
expr: 'increase(mysql_global_status_slow_queries[1m]) > 0'
|
||||
expr: 'delta(mysql_global_status_slow_queries[1m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MySQL slow queries (instance {{ $labels.instance }})
|
||||
description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "MySQL server has some new slow queries ({{ $value }} in the last minute).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of rate().
|
||||
- alert: MysqlInnodbLogWaits
|
||||
expr: 'rate(mysql_global_status_innodb_log_waits[15m]) > 10'
|
||||
expr: 'deriv(mysql_global_status_innodb_log_waits[15m]) > 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
|
||||
description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "MySQL innodb log writes stalling ({{ $value }} waits/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlRestarted
|
||||
expr: 'mysql_global_status_uptime < 60'
|
||||
|
|
@ -93,3 +99,40 @@ groups:
|
|||
annotations:
|
||||
summary: MySQL restarted (instance {{ $labels.instance }})
|
||||
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# mysqld_exporter exposes SHOW GLOBAL STATUS variables as untyped/gauge, so deriv() is used instead of irate().
|
||||
- alert: MysqlHighQps
|
||||
expr: 'deriv(mysql_global_status_questions[1m]) > 10000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: MySQL High QPS (instance {{ $labels.instance }})
|
||||
description: "MySQL is being overload with unusual QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlTooManyOpenFiles
|
||||
expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 and mysql_global_variables_open_files_limit > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MySQL too many open files (instance {{ $labels.instance }})
|
||||
description: "MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlInnodbForceRecoveryIsEnabled
|
||||
expr: 'mysql_global_variables_innodb_force_recovery != 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
|
||||
description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: MysqlInnodbHistory_lenTooLong
|
||||
expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
|
||||
description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
142
dist/rules/nats/nats-exporter.yml
vendored
142
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -2,40 +2,126 @@ groups:
|
|||
|
||||
- name: NatsExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NatsHighConnectionCount
|
||||
expr: 'gnatsd_varz_connections > 100'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high connection count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighPendingBytes
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighSubscriptionsCount
|
||||
expr: 'gnatsd_connz_subscriptions > 50'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high subscriptions count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_routez_num_routes > 10'
|
||||
expr: 'gnatsd_varz_routes > 10'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighMemoryUsage
|
||||
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high memory usage (instance {{ $labels.instance }})
|
||||
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsSlowConsumers
|
||||
expr: 'gnatsd_varz_slow_consumers > 0'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats slow consumers (instance {{ $labels.instance }})
|
||||
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Replace job="nats" with the actual job name in your Prometheus configuration.
|
||||
- alert: NatsServerDown
|
||||
expr: 'absent(up{job="nats"})'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats server down (instance {{ $labels.instance }})
|
||||
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
|
||||
- alert: NatsHighCpuUsage
|
||||
expr: 'gnatsd_varz_cpu > 80'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high CPU usage (instance {{ $labels.instance }})
|
||||
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighNumberOfConnections
|
||||
expr: 'gnatsd_connz_num_connections > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high number of connections (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamStoreUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
|
||||
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamMemoryUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
|
||||
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighNumberOfSubscriptions
|
||||
expr: 'gnatsd_varz_subscriptions > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighPendingBytes
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsTooManyErrors
|
||||
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats too many errors (instance {{ $labels.instance }})
|
||||
description: "NATS server has encountered {{ $value }} JetStream API errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsJetstreamAccountsExceeded
|
||||
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats JetStream accounts exceeded (instance {{ $labels.instance }})
|
||||
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only enable this alert if your deployment requires leaf node connections.
|
||||
# This will fire spuriously if leaf nodes are not configured.
|
||||
- alert: NatsLeafNodeConnectionIssue
|
||||
expr: 'gnatsd_varz_leafnodes == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
|
||||
description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
18
dist/rules/netdata/embedded-exporter.yml
vendored
18
dist/rules/netdata/embedded-exporter.yml
vendored
|
|
@ -2,10 +2,12 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# This is a gauge metric (not a counter). Checking idle < 20% means CPU usage > 80%.
|
||||
- alert: NetdataHighCpuUsage
|
||||
expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="idle"}[1m]) > 80'
|
||||
expr: 'netdata_cpu_cpu_percentage_average{dimension="idle"} < 20'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -13,17 +15,17 @@ groups:
|
|||
summary: Netdata high cpu usage (instance {{ $labels.instance }})
|
||||
description: "Netdata high CPU usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: 'rate(netdata_cpu_cpu_percentage_average{dimension="steal"}[1m]) > 10'
|
||||
- alert: NetdataCpuStealNoisyNeighbor
|
||||
expr: 'netdata_cpu_cpu_percentage_average{dimension="steal"} > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
summary: Netdata CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NetdataHighMemoryUsage
|
||||
expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
|
||||
expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20 and netdata_system_ram_MiB_average > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -32,7 +34,7 @@ groups:
|
|||
description: "Netdata high memory usage (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NetdataLowDiskSpace
|
||||
expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20'
|
||||
expr: '100 / netdata_disk_space_GB_average * netdata_disk_space_GB_average{dimension=~"avail|cached"} < 20 and netdata_disk_space_GB_average > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -65,7 +67,7 @@ groups:
|
|||
severity: info
|
||||
annotations:
|
||||
summary: Netdata disk reallocated sectors (instance {{ $labels.instance }})
|
||||
description: "Reallocated sectors on disk\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Disk reallocated sectors detected ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NetdataDiskCurrentPendingSector
|
||||
expr: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
|
||||
|
|
@ -83,4 +85,4 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Netdata reported uncorrectable disk sectors (instance {{ $labels.instance }})
|
||||
description: "Reported uncorrectable disk sectors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Reported uncorrectable disk sectors ({{ $value }} sectors)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
5
dist/rules/nginx/knyar-nginx-exporter.yml
vendored
5
dist/rules/nginx/knyar-nginx-exporter.yml
vendored
|
|
@ -2,10 +2,11 @@ groups:
|
|||
|
||||
- name: KnyarNginxExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
|
||||
expr: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -14,7 +15,7 @@ groups:
|
|||
description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NginxHighHttp5xxErrorRate
|
||||
expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5'
|
||||
expr: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 and sum(rate(nginx_http_requests_total[1m])) > 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
9
dist/rules/nomad/embedded-exporter.yml
vendored
9
dist/rules/nomad/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NomadJobFailed
|
||||
|
|
@ -11,7 +12,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job failed (instance {{ $labels.instance }})
|
||||
description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Nomad job {{ $labels.job }} has {{ $value }} failed allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadJobLost
|
||||
expr: 'nomad_nomad_job_summary_lost > 0'
|
||||
|
|
@ -20,7 +21,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job lost (instance {{ $labels.instance }})
|
||||
description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Nomad job {{ $labels.job }} has {{ $value }} lost allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadJobQueued
|
||||
expr: 'nomad_nomad_job_summary_queued > 0'
|
||||
|
|
@ -29,7 +30,7 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad job queued (instance {{ $labels.instance }})
|
||||
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Nomad job {{ $labels.job }} has {{ $value }} queued allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NomadBlockedEvaluation
|
||||
expr: 'nomad_nomad_blocked_evals_total_blocked > 0'
|
||||
|
|
@ -38,4 +39,4 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Nomad blocked evaluation (instance {{ $labels.instance }})
|
||||
description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Nomad has {{ $value }} blocked evaluations. The cluster may lack resources to place allocations.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
1
dist/rules/openebs/embedded-exporter.yml
vendored
1
dist/rules/openebs/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: OpenebsUsedPoolCapacity
|
||||
|
|
|
|||
60
dist/rules/opensearch/opensearch-project-opensearch-prometheus-exporter.yml
vendored
Normal file
60
dist/rules/opensearch/opensearch-project-opensearch-prometheus-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
groups:
|
||||
|
||||
- name: OpensearchProjectOpensearchPrometheusExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: OpensearchIsUnhealthy
|
||||
expr: 'opensearch_cluster_status != 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenSearch is unhealthy (instance {{ $labels.instance }})
|
||||
description: "OpenSearch cluster {{ $labels.cluster }} is unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpensearchHighHeapUsage
|
||||
expr: 'opensearch_jvm_mem_heap_used_percent > 90'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenSearch high heap usage (instance {{ $labels.instance }})
|
||||
description: "OpenSearch heap usage on cluster {{ $labels.cluster }} is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpensearchCircuitbreakerTripped
|
||||
expr: 'opensearch_circuitbreaker_tripped_count > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenSearch circuitbreaker tripped (instance {{ $labels.instance }})
|
||||
description: "The circuitbreaker on OpenSearch cluster {{ $labels.cluster }} has tripped to prevent Java OutOfMemoryError\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpensearchHasPendingTasks
|
||||
expr: 'opensearch_cluster_pending_tasks_number > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenSearch has pending tasks (instance {{ $labels.instance }})
|
||||
description: "OpenSearch cluster {{ $labels.cluster }} has pending tasks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpensearchIndexingIsThrottled
|
||||
expr: 'opensearch_indices_indexing_is_throttled_bool > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenSearch indexing is throttled (instance {{ $labels.instance }})
|
||||
description: "Indexing on OpenSearch cluster {{ $labels.cluster }} is throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpensearchHasInactiveShards
|
||||
expr: 'opensearch_cluster_shards_active_percent < 100.0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenSearch has inactive shards (instance {{ $labels.instance }})
|
||||
description: "OpenSearch cluster {{ $labels.cluster }} has inactive shards\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
192
dist/rules/openstack/openstack-exporter.yml
vendored
Normal file
192
dist/rules/openstack/openstack-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
groups:
|
||||
|
||||
- name: OpenstackExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Adjust the job label regex to match the actual job name in your Prometheus scrape config.
|
||||
- alert: OpenstackExporterDown
|
||||
expr: 'up{job=~".*openstack.*"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenStack exporter down (instance {{ $labels.instance }})
|
||||
description: "The OpenStack exporter is down. OpenStack cloud metrics are no longer being collected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNovaAgentDown
|
||||
expr: 'openstack_nova_agent_state{adminState="enabled"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenStack Nova agent down (instance {{ $labels.instance }})
|
||||
description: "Nova agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronAgentDown
|
||||
expr: 'openstack_neutron_agent_state{adminState="up"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenStack Neutron agent down (instance {{ $labels.instance }})
|
||||
description: "Neutron agent {{ $labels.hostname }} ({{ $labels.service }}) is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackCinderAgentDown
|
||||
expr: 'openstack_cinder_agent_state{adminState="enabled"} == 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenStack Cinder agent down (instance {{ $labels.instance }})
|
||||
description: "Cinder agent {{ $labels.hostname }} ({{ $labels.service }}) is down in zone {{ $labels.zone }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
|
||||
- alert: OpenstackHypervisorHighVcpuUsage
|
||||
expr: 'openstack_nova_vcpus_used / openstack_nova_vcpus_available > 0.9 and openstack_nova_vcpus_available > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack hypervisor high vCPU usage (instance {{ $labels.instance }})
|
||||
description: "Hypervisor {{ $labels.hostname }} vCPU usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The threshold of 90% is a rough default. Adjust based on your overcommit ratio and workload patterns.
|
||||
- alert: OpenstackHypervisorHighMemoryUsage
|
||||
expr: 'openstack_nova_memory_used_bytes / openstack_nova_memory_available_bytes > 0.9 and openstack_nova_memory_available_bytes > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack hypervisor high memory usage (instance {{ $labels.instance }})
|
||||
description: "Hypervisor {{ $labels.hostname }} memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackHypervisorHighDiskUsage
|
||||
expr: 'openstack_nova_local_storage_used_bytes / openstack_nova_local_storage_available_bytes > 0.9 and openstack_nova_local_storage_available_bytes > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack hypervisor high disk usage (instance {{ $labels.instance }})
|
||||
description: "Hypervisor {{ $labels.hostname }} local disk usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A value of -1 for limits_vcpus_max means unlimited quota (no limit set).
|
||||
- alert: OpenstackNovaTenantVcpuQuotaNearlyExhausted
|
||||
expr: 'openstack_nova_limits_vcpus_used / openstack_nova_limits_vcpus_max > 0.9 and openstack_nova_limits_vcpus_max > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Nova tenant vCPU quota nearly exhausted (instance {{ $labels.instance }})
|
||||
description: "Tenant {{ $labels.tenant }} has used over 90% of its vCPU quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNovaTenantMemoryQuotaNearlyExhausted
|
||||
expr: 'openstack_nova_limits_memory_used / openstack_nova_limits_memory_max > 0.9 and openstack_nova_limits_memory_max > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Nova tenant memory quota nearly exhausted (instance {{ $labels.instance }})
|
||||
description: "Tenant {{ $labels.tenant }} has used over 90% of its memory quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNovaTenantInstanceQuotaNearlyExhausted
|
||||
expr: 'openstack_nova_limits_instances_used / openstack_nova_limits_instances_max > 0.9 and openstack_nova_limits_instances_max > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Nova tenant instance quota nearly exhausted (instance {{ $labels.instance }})
|
||||
description: "Tenant {{ $labels.tenant }} has used over 90% of its instance quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackCinderTenantVolumeQuotaNearlyExhausted
|
||||
expr: 'openstack_cinder_limits_volume_used_gb / openstack_cinder_limits_volume_max_gb > 0.9 and openstack_cinder_limits_volume_max_gb > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Cinder tenant volume quota nearly exhausted (instance {{ $labels.instance }})
|
||||
description: "Tenant {{ $labels.tenant }} has used over 90% of its volume storage quota\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackCinderPoolLowFreeCapacity
|
||||
expr: 'openstack_cinder_pool_capacity_free_gb / openstack_cinder_pool_capacity_total_gb < 0.1 and openstack_cinder_pool_capacity_total_gb > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Cinder pool low free capacity (instance {{ $labels.instance }})
|
||||
description: "Cinder storage pool {{ $labels.name }} has less than 10% free capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronFloatingIpsAssociatedButNotActive
|
||||
expr: 'openstack_neutron_floating_ips_associated_not_active > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Neutron floating IPs associated but not active (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} floating IPs are associated to a private IP but are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronRoutersNotActive
|
||||
expr: 'openstack_neutron_routers_not_active > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Neutron routers not active (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} Neutron routers are not in ACTIVE state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronSubnetIpPoolExhaustion
|
||||
expr: 'openstack_neutron_network_ip_availabilities_used / openstack_neutron_network_ip_availabilities_total > 0.9 and openstack_neutron_network_ip_availabilities_total > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Neutron subnet IP pool exhaustion (instance {{ $labels.instance }})
|
||||
description: "Subnet {{ $labels.subnet_name }} on network {{ $labels.network_name }} has used over 90% of its IP pool\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNeutronPortsWithoutIps
|
||||
expr: 'openstack_neutron_ports_no_ips > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Neutron ports without IPs (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} active ports have no IP addresses assigned\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackLoadBalancerNotOnline
|
||||
expr: 'openstack_loadbalancer_loadbalancer_status{operating_status!="ONLINE"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack load balancer not online (instance {{ $labels.instance }})
|
||||
description: "Load balancer {{ $labels.name }} ({{ $labels.id }}) operating status is {{ $labels.operating_status }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackNovaInstancesInErrorState
|
||||
expr: 'sum(openstack_nova_server_status{status="ERROR"}) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Nova instances in ERROR state (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} Nova instances are in ERROR state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpenstackCinderVolumesInErrorState
|
||||
expr: 'openstack_cinder_volume_status_counter{status=~"error.*"} > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack Cinder volumes in error state (instance {{ $labels.instance }})
|
||||
description: "{{ $value }} Cinder volumes are in an error state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This alert factors in the allocation ratio to compute effective capacity.
|
||||
# The threshold of 90% is a rough default. Adjust based on your allocation ratios and workload patterns.
|
||||
- alert: OpenstackPlacementResourceHighUsage
|
||||
expr: 'openstack_placement_resource_usage / (openstack_placement_resource_total * openstack_placement_resource_allocation_ratio) > 0.9 and openstack_placement_resource_total > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenStack placement resource high usage (instance {{ $labels.instance }})
|
||||
description: "Resource {{ $labels.resourcetype }} on host {{ $labels.hostname }} usage exceeds 90% of its allocation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
128
dist/rules/opentelemetry-collector/embedded-exporter.yml
vendored
Normal file
128
dist/rules/opentelemetry-collector/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
# OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
|
||||
# These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
|
||||
# All collector internal metrics are prefixed with 'otelcol_'.
|
||||
|
||||
rules:
|
||||
|
||||
# The job regex below is a guess at common naming; change it to match the job
# name used in your Prometheus scrape config.
# Fires after the scrape target has reported up == 0 for 1 minute.
- alert: OpentelemetryCollectorDown
  expr: 'up{job=~".*otel.*collector.*"} == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'OpenTelemetry Collector down (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Receiver is refusing spans. The 0.05/s floor on the 5m rate keeps a single
# transient refusal from firing the alert.
- alert: OpentelemetryCollectorReceiverRefusedSpans
  expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0.05'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s spans on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Receiver is refusing metric points. The 0.05/s floor on the 5m rate keeps a
# single transient refusal from firing the alert.
- alert: OpentelemetryCollectorReceiverRefusedMetricPoints
  expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0.05'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s metric points on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Receiver is refusing log records. The 0.05/s floor on the 5m rate keeps a
# single transient refusal from firing the alert.
- alert: OpentelemetryCollectorReceiverRefusedLogRecords
  expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0.05'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector is refusing {{ $value | humanize }}/s log records on {{ $labels.receiver }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Exporter is failing to send spans. The 0.05/s floor on the 5m rate keeps a
# single transient failure from firing the alert.
- alert: OpentelemetryCollectorExporterFailedSpans
  expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s spans via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Exporter is failing to send metric points. The 0.05/s floor on the 5m rate
# keeps a single transient failure from firing the alert.
- alert: OpentelemetryCollectorExporterFailedMetricPoints
  expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s metric points via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Exporter is failing to send log records. The 0.05/s floor on the 5m rate
# keeps a single transient failure from firing the alert.
- alert: OpentelemetryCollectorExporterFailedLogRecords
  expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector failing to send {{ $value | humanize }}/s log records via {{ $labels.exporter }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires as soon as an exporter queue is more than 80% of its capacity.
# The division uses on(instance, job, exporter), so its result carries only
# those three labels. The zero-capacity guard must therefore be matched with
# the same on(...) list; with default full-label matching, any extra label on
# otelcol_exporter_queue_capacity would prevent the `and` from ever matching.
- alert: OpentelemetryCollectorExporterQueueNearlyFull
  expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Processor is refusing spans. The 0.05/s floor on the 5m rate keeps a single
# transient refusal from firing the alert.
# Note: these processor metrics are deprecated since collector v0.110.0.
- alert: OpentelemetryCollectorProcessorRefusedSpans
  expr: 'rate(otelcol_processor_refused_spans[5m]) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Processor is refusing metric points. The 0.05/s floor on the 5m rate keeps a
# single transient refusal from firing the alert.
# Note: these processor metrics are deprecated since collector v0.110.0.
- alert: OpentelemetryCollectorProcessorRefusedMetricPoints
  expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0.05'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points ({{ $value | humanize }}/s), likely due to backpressure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when heap_alloc / total_sys_memory stays above 0.9 for 5 minutes,
# matched per (instance, job).
# NOTE(review): otelcol_process_runtime_total_sys_memory_bytes looks like the
# memory the Go runtime obtained from the OS rather than host memory - confirm
# that this ratio expresses the "memory usage" you intend to alert on.
- alert: OpentelemetryCollectorHighMemoryUsage
  expr: '(otelcol_process_runtime_heap_alloc_bytes / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes) > 0.9'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when, over 5m windows, the "otlp" receiver accepts no spans at all
# while still refusing some, and this holds for 2 minutes.
- alert: OpentelemetryCollectorOtlpReceiverErrors
  expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})'
    description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
84
dist/rules/oracle-database/iamseth-oracledb-exporter.yml
vendored
Normal file
84
dist/rules/oracle-database/iamseth-oracledb-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
groups:
|
||||
|
||||
- name: IamsethOracledbExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Fires when oracledb_up has been 0 for 1 minute; the delay lets a restart
# complete without paging.
- alert: OracleDbDown
  expr: 'oracledb_up == 0'
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Oracle DB down (instance {{ $labels.instance }})'
    description: "Oracle Database instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Session utilization as a percentage of the configured limit. The 85%
# threshold is workload-dependent - tune it for your environment. The trailing
# `limit > 0` clause guards against a zero limit in the denominator.
- alert: OracleDbSessionsReachingLimit(>85%)
  expr: 'oracledb_resource_current_utilization{resource_name="sessions"} / oracledb_resource_limit_value{resource_name="sessions"} * 100 > 85 and oracledb_resource_limit_value{resource_name="sessions"} > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB sessions reaching limit (> 85%) (instance {{ $labels.instance }})'
    description: "Oracle Database session utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Process utilization as a percentage of the configured limit. The 85%
# threshold is workload-dependent - tune it for your environment. The trailing
# `limit > 0` clause guards against a zero limit in the denominator.
- alert: OracleDbProcessesReachingLimit(>85%)
  expr: 'oracledb_resource_current_utilization{resource_name="processes"} / oracledb_resource_limit_value{resource_name="processes"} * 100 > 85 and oracledb_resource_limit_value{resource_name="processes"} > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB processes reaching limit (> 85%) (instance {{ $labels.instance }})'
    description: "Oracle Database process utilization is above 85% on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warning tier: a tablespace has reported more than 85% used for 5 minutes.
- alert: OracleDbTablespaceReachingCapacity(>85%)
  expr: 'oracledb_tablespace_used_percent > 85'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB tablespace reaching capacity (> 85%) (instance {{ $labels.instance }})'
    description: "Oracle Database tablespace {{ $labels.tablespace }} is above 85% usage on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical tier: a tablespace has reported more than 95% used for 5 minutes.
- alert: OracleDbTablespaceFull(>95%)
  expr: 'oracledb_tablespace_used_percent > 95'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: 'Oracle DB tablespace full (> 95%) (instance {{ $labels.instance }})'
    description: "Oracle Database tablespace {{ $labels.tablespace }} is critically full on {{ $labels.instance }} (current value: {{ $value }}%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Rollbacks as a percentage of all finished transactions (commits + rollbacks)
# over 5m. More than 20% often points at application-level problems such as
# deadlocks, constraint violations, or poorly designed transactions.
# The trailing `> 0` clause skips idle databases (zero denominator).
- alert: OracleDbHighUserRollbacks
  expr: 'rate(oracledb_activity_user_rollbacks[5m]) / (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) * 100 > 20 and (rate(oracledb_activity_user_commits[5m]) + rate(oracledb_activity_user_rollbacks[5m])) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB high user rollbacks (instance {{ $labels.instance }})'
    description: "Oracle Database on {{ $labels.instance }} has a high rollback rate ({{ $value }}% of transactions are rolled back)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when ACTIVE USER sessions stay above 200 for 5 minutes. The limit of
# 200 is highly workload-dependent - tune it for your environment.
- alert: OracleDbTooManyActiveSessions
  expr: 'oracledb_sessions_value{status="ACTIVE", type="USER"} > 200'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB too many active sessions (instance {{ $labels.instance }})'
    description: "Oracle Database on {{ $labels.instance }} has too many active user sessions (current value: {{ $value }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The value comes from v$waitclassmetric and is already a normalized rate
# (centiseconds per second), so no rate() is applied. A threshold of 300 means
# 3 seconds of user I/O wait per second of wall time.
- alert: OracleDbHighWaitTime(userI/o)
  expr: 'oracledb_wait_time_user_io > 300'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Oracle DB high wait time (user I/O) (instance {{ $labels.instance }})'
    description: "Oracle Database on {{ $labels.instance }} is experiencing high user I/O wait time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,11 +2,13 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterPatroni
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: PatroniHasNoLeader
|
||||
expr: '(max by (scope) (patroni_master) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
|
||||
for: 0m
|
||||
expr: '(max by (scope) (patroni_primary) < 1) and (max by (scope) (patroni_standby_leader) < 1)'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: SpreakerPgbouncerExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PgbouncerActiveConnections
|
||||
|
|
@ -20,10 +21,10 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: PGBouncer errors (instance {{ $labels.instance }})
|
||||
description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PgbouncerMaxConnections
|
||||
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[30s]) > 0'
|
||||
expr: 'increase(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[2m]) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
5
dist/rules/php-fpm/bakins-fpm-exporter.yml
vendored
5
dist/rules/php-fpm/bakins-fpm-exporter.yml
vendored
|
|
@ -2,13 +2,14 @@ groups:
|
|||
|
||||
- name: BakinsFpmExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: Php-fpmMax-childrenReached
|
||||
expr: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
|
||||
expr: 'sum(increase(phpfpm_max_children_reached_total[5m])) by (instance) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
|
||||
description: "PHP-FPM reached max children - {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "PHP-FPM reached max children on {{ $labels.instance }} ({{ $value }} times in the last 5m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
66
dist/rules/postgresql/postgres-exporter.yml
vendored
66
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -2,11 +2,13 @@ groups:
|
|||
|
||||
- name: PostgresExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# 1m delay allows a restart without triggering an alert.
|
||||
- alert: PostgresqlDown
|
||||
expr: 'pg_up == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -32,7 +34,7 @@ groups:
|
|||
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -41,7 +43,7 @@ groups:
|
|||
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -62,22 +64,22 @@ groups:
|
|||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres",datid!="0"}[1m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "PostgreSQL has dead-locks ({{ $value }} in the last minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRollbackRate
|
||||
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
|
||||
expr: 'sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0.02 and (sum by (namespace,datname,instance) (rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + sum by (namespace,datname,instance) (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m]))) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -86,7 +88,7 @@ groups:
|
|||
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlCommitRateLow
|
||||
expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
|
||||
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -94,6 +96,7 @@ groups:
|
|||
summary: Postgresql commit rate low (instance {{ $labels.instance }})
|
||||
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# pg_txid_current is not a default postgres_exporter metric. You need to define a custom query. See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlLowXidConsumption
|
||||
expr: 'rate(pg_txid_current[1m]) < 5'
|
||||
for: 2m
|
||||
|
|
@ -103,26 +106,8 @@ groups:
|
|||
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
|
||||
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlUnusedReplicationSlot
|
||||
expr: 'pg_replication_slots_active == 0'
|
||||
expr: '(pg_replication_slots_active == 0) and (pg_replication_is_replica == 0)'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -131,7 +116,7 @@ groups:
|
|||
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyDeadTuples
|
||||
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
|
||||
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 and (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
@ -140,7 +125,7 @@ groups:
|
|||
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlConfigurationChanged
|
||||
expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
|
||||
expr: '{__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} != ON(__name__, instance) {__name__=~"pg_settings_.*",__name__!="pg_settings_transaction_read_only"} OFFSET 5m'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
|
|
@ -148,17 +133,18 @@ groups:
|
|||
summary: Postgresql configuration changed (instance {{ $labels.instance }})
|
||||
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# pg_stat_ssl_compression is not a default postgres_exporter metric and is only available on PostgreSQL 9.5-13 (removed in PG 14). See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlSslCompressionActive
|
||||
expr: 'sum(pg_stat_ssl_compression) > 0'
|
||||
expr: 'sum by (instance) (pg_stat_ssl_compression) > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
|
||||
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
expr: '((sum by (instance) (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 and (pg_settings_max_locks_per_transaction * pg_settings_max_connections) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
@ -166,6 +152,7 @@ groups:
|
|||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlBloatIndexHigh(>80%)
|
||||
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
|
||||
for: 1h
|
||||
|
|
@ -175,6 +162,7 @@ groups:
|
|||
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlBloatTableHigh(>80%)
|
||||
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
|
||||
for: 1h
|
||||
|
|
@ -184,11 +172,21 @@ groups:
|
|||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlInvalidIndex
|
||||
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql invalid index (instance {{ $labels.instance }})
|
||||
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlReplicationLag
|
||||
expr: 'pg_replication_lag_seconds > 5'
|
||||
for: 30s
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql replication lag (instance {{ $labels.instance }})
|
||||
description: "The PostgreSQL replication lag is high (> 5s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
102
dist/rules/process-exporter/process-exporter.yml
vendored
Normal file
102
dist/rules/process-exporter/process-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
groups:
|
||||
|
||||
- name: ProcessExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# Fires when a monitored process group has reported zero processes for
# 5 minutes.
- alert: ProcessExporterGroupDown
  expr: 'namedprocess_namegroup_num_procs == 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter group down (instance {{ $labels.instance }})'
    description: "No processes found for group {{ $labels.groupname }}. The service may have stopped. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Resident memory of a process group above 4e+09 bytes for 5 minutes.
# The 4GB limit is arbitrary and depends on the monitored process - set it
# per group.
- alert: ProcessExporterHighMemoryUsage
  expr: 'namedprocess_namegroup_memory_bytes{memtype="resident"} > 4e+09'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high memory usage (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of resident memory. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Value is core-equivalent %: 100% = 1 full core, 200% = 2 cores, etc.
# Threshold of 80% is per-core. Adjust based on expected workload.
# CPU time is exported as one series per `mode` (user/system - see the
# process-exporter README), so the modes are summed before comparing;
# otherwise each mode would be checked against the threshold on its own and
# the reported value would not be the group's total CPU usage.
- alert: ProcessExporterHighCpuUsage
  expr: 'sum without (mode) (rate(namedprocess_namegroup_cpu_seconds_total[5m])) * 100 > 80'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high CPU usage (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} is using {{ $value }}% CPU (core-equivalent). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warning tier: the worst file-descriptor ratio in the group has been above
# 0.8 for 5 minutes.
- alert: ProcessExporterHighFileDescriptorUsage
  expr: 'namedprocess_namegroup_worst_fd_ratio > 0.8'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high file descriptor usage (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} is using more than 80% of its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical tier: the worst file-descriptor ratio in the group has been above
# 0.95 for 2 minutes.
- alert: ProcessExporterFileDescriptorsExhausted
  expr: 'namedprocess_namegroup_worst_fd_ratio > 0.95'
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Process exporter file descriptors exhausted (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} has nearly exhausted its file descriptor limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Swapped memory of a process group above 512e+06 bytes for 5 minutes.
# The 512MB limit is arbitrary - set it per group and environment.
- alert: ProcessExporterHighSwapUsage
  expr: 'namedprocess_namegroup_memory_bytes{memtype="swapped"} > 512e+06'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high swap usage (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} is using {{ $value | humanize }}B of swap. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Fires when a process group has had more than 5 processes in the Zombie
# state for 5 minutes.
- alert: ProcessExporterZombieProcesses
  expr: 'namedprocess_namegroup_states{state="Zombie"} > 5'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter zombie processes (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} has {{ $value }} zombie processes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only voluntary context switches are counted; involuntary ones are normal
# under CPU contention. The 50000/s threshold is a rough default - tune it to
# your workload.
- alert: ProcessExporterHighContextSwitching
  expr: 'rate(namedprocess_namegroup_context_switches_total{ctxswitchtype="voluntary"}[5m]) > 50000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high context switching (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} has a high rate of context switches ({{ $value }}/s). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 5m write rate of a process group above 100e+06 bytes/s for 5 minutes.
# The 100MB/s limit is arbitrary - set it per group.
- alert: ProcessExporterHighDiskWriteIo
  expr: 'rate(namedprocess_namegroup_write_bytes_total[5m]) > 100e+06'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Process exporter high disk write IO (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} is performing {{ $value | humanize }}B/s of disk writes. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# A restart is inferred from a change, within the last 5m, of the oldest
# process start time in the group. The `num_procs > 0` clause limits the alert
# to groups that currently have running processes.
- alert: ProcessExporterProcessRestarting
  expr: 'changes(namedprocess_namegroup_oldest_start_time_seconds[5m]) > 0 and namedprocess_namegroup_num_procs > 0'
  for: 0m
  labels:
    severity: info
  annotations:
    summary: 'Process exporter process restarting (instance {{ $labels.instance }})'
    description: "Process group {{ $labels.groupname }} has restarted (oldest process start time changed). (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PrometheusJobMissing
|
||||
|
|
@ -13,9 +14,11 @@ groups:
|
|||
summary: Prometheus job missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Only fire if at least one target in the job is still up.
|
||||
# If all targets are down, PrometheusJobMissing or PrometheusAllTargetsMissing will fire instead.
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: 'up == 0'
|
||||
for: 0m
|
||||
expr: 'up == 0 unless on(job) (sum by (job) (up) == 0)'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -24,7 +27,7 @@ groups:
|
|||
|
||||
- alert: PrometheusAllTargetsMissing
|
||||
expr: 'sum by (job) (up) == 0'
|
||||
for: 0m
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -32,8 +35,8 @@ groups:
|
|||
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetMissingWithWarmupTime
|
||||
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||
for: 0m
|
||||
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
|
|
@ -140,13 +143,13 @@ groups:
|
|||
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusAlertmanagerNotificationFailing
|
||||
expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
|
||||
expr: 'rate(alertmanager_notifications_failed_total[3m]) > 0.05'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
||||
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Alertmanager is failing sending notifications ({{ $value }} notifications/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetEmpty
|
||||
expr: 'prometheus_sd_discovered_targets == 0'
|
||||
|
|
@ -173,16 +176,16 @@ groups:
|
|||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus large scrape (instance {{ $labels.instance }})
|
||||
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Prometheus has many scrapes that exceed the sample limit ({{ $value }} scrapes)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTargetScrapeDuplicate
|
||||
expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
|
||||
expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
|
||||
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
description: "Prometheus has many samples rejected due to duplicate timestamps but different values ({{ $value }} samples)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PrometheusTsdbCheckpointCreationFailures
|
||||
expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue