diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml
index 64a318a..85b6ce4 100644
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@@ -1,6 +1,7 @@
name: Publish
on:
+ workflow_dispatch:
push:
branches:
- master
@@ -13,22 +14,23 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
- ruby-version: 2.7
+ ruby-version: '3.4'
- name: Set up yq
uses: mikefarah/yq@master
- name: Install liquid
- run: gem install liquid-cli
+ run: |
+ gem install liquid -v 5.5.1
+ gem install liquid-cli
- name: Build rule configuration
run: |
- gem install liquid-cli
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
rm -rf dist/rules
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5d0cc3e..00059f0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -8,12 +8,12 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout Repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
- name: Set up Ruby
uses: ruby/setup-ruby@v1
with:
- ruby-version: 2.7
+ ruby-version: '3.4'
- name: Set up yq
uses: mikefarah/yq@master
@@ -31,7 +31,7 @@ jobs:
mkdir -p "${subdir}"
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
-
+
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
diff --git a/.gitignore b/.gitignore
index 12ca387..66a746a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,4 @@ _site/
.jekyll-metadata
_data/rules.json
test/rules/
+/node_modules
\ No newline at end of file
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1fcb24b..02b8c38 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -32,8 +32,8 @@ Or with Docker:
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
```
-Or with Docker-Compose:
+Or with Docker Compose:
```
-docker-compose up -d
+docker compose up -d
```
diff --git a/FUNDING.json b/FUNDING.json
new file mode 100644
index 0000000..c4eccbf
--- /dev/null
+++ b/FUNDING.json
@@ -0,0 +1,7 @@
+{
+ "drips": {
+ "ethereum": {
+ "ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1"
+ }
+ }
+}
diff --git a/Gemfile b/Gemfile
index 31ddf5d..cddfa60 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,3 @@
source 'https://rubygems.org'
-gem 'github-pages', group: :jekyll_plugins
-gem 'webrick', '~> 1.3', '>= 1.3.1'
\ No newline at end of file
+gem 'github-pages', '>= 232', group: :jekyll_plugins
+gem 'webrick', '~> 1.8'
\ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
index 7b76ef9..305a897 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -1,66 +1,61 @@
GEM
remote: https://rubygems.org/
specs:
- activesupport (6.0.6.1)
- concurrent-ruby (~> 1.0, >= 1.0.2)
- i18n (>= 0.7, < 2)
- minitest (~> 5.1)
- tzinfo (~> 1.1)
- zeitwerk (~> 2.2, >= 2.2.2)
- addressable (2.8.0)
- public_suffix (>= 2.0.2, < 5.0)
+ activesupport (7.2.1)
+ base64
+ bigdecimal
+ concurrent-ruby (~> 1.0, >= 1.3.1)
+ connection_pool (>= 2.2.5)
+ drb
+ i18n (>= 1.6, < 2)
+ logger (>= 1.4.2)
+ minitest (>= 5.1)
+ securerandom (>= 0.3)
+ tzinfo (~> 2.0, >= 2.0.5)
+ addressable (2.8.7)
+ public_suffix (>= 2.0.2, < 7.0)
+ base64 (0.2.0)
+ bigdecimal (3.1.8)
coffee-script (2.4.1)
coffee-script-source
execjs
- coffee-script-source (1.11.1)
+ coffee-script-source (1.12.2)
colorator (1.1.0)
commonmarker (0.23.10)
- concurrent-ruby (1.2.0)
- dnsruby (1.61.9)
- simpleidn (~> 0.1)
+ concurrent-ruby (1.3.4)
+ connection_pool (2.4.1)
+ csv (3.3.0)
+ dnsruby (1.72.2)
+ simpleidn (~> 0.2.1)
+ drb (2.2.1)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
- ethon (0.15.0)
+ ethon (0.16.0)
ffi (>= 1.15.0)
eventmachine (1.2.7)
- execjs (2.8.1)
- faraday (1.10.0)
- faraday-em_http (~> 1.0)
- faraday-em_synchrony (~> 1.0)
- faraday-excon (~> 1.1)
- faraday-httpclient (~> 1.0)
- faraday-multipart (~> 1.0)
- faraday-net_http (~> 1.0)
- faraday-net_http_persistent (~> 1.0)
- faraday-patron (~> 1.0)
- faraday-rack (~> 1.0)
- faraday-retry (~> 1.0)
- ruby2_keywords (>= 0.0.4)
- faraday-em_http (1.0.0)
- faraday-em_synchrony (1.0.0)
- faraday-excon (1.1.0)
- faraday-httpclient (1.0.1)
- faraday-multipart (1.0.3)
- multipart-post (>= 1.2, < 3)
- faraday-net_http (1.0.1)
- faraday-net_http_persistent (1.2.0)
- faraday-patron (1.0.0)
- faraday-rack (1.0.0)
- faraday-retry (1.0.3)
- ffi (1.15.5)
+ execjs (2.9.1)
+ faraday (2.12.0)
+ faraday-net_http (>= 2.0, < 3.4)
+ json
+ logger
+ faraday-net_http (3.3.0)
+ net-http
+ ffi (1.17.0)
+ ffi (1.17.0-x86_64-linux-gnu)
+ ffi (1.17.0-x86_64-linux-musl)
forwardable-extended (2.6.0)
- gemoji (3.0.1)
- github-pages (226)
- github-pages-health-check (= 1.17.9)
- jekyll (= 3.9.2)
- jekyll-avatar (= 0.7.0)
- jekyll-coffeescript (= 1.1.1)
- jekyll-commonmark-ghpages (= 0.2.0)
- jekyll-default-layout (= 0.1.4)
- jekyll-feed (= 0.15.1)
+ gemoji (4.1.0)
+ github-pages (232)
+ github-pages-health-check (= 1.18.2)
+ jekyll (= 3.10.0)
+ jekyll-avatar (= 0.8.0)
+ jekyll-coffeescript (= 1.2.2)
+ jekyll-commonmark-ghpages (= 0.5.1)
+ jekyll-default-layout (= 0.1.5)
+ jekyll-feed (= 0.17.0)
jekyll-gist (= 1.5.0)
- jekyll-github-metadata (= 2.13.0)
+ jekyll-github-metadata (= 2.16.1)
jekyll-include-cache (= 0.2.1)
jekyll-mentions (= 1.6.0)
jekyll-optional-front-matter (= 0.3.2)
@@ -87,32 +82,34 @@ GEM
jekyll-theme-tactile (= 0.2.0)
jekyll-theme-time-machine (= 0.2.0)
jekyll-titles-from-headings (= 0.5.3)
- jemoji (= 0.12.0)
- kramdown (= 2.3.2)
+ jemoji (= 0.13.0)
+ kramdown (= 2.4.0)
kramdown-parser-gfm (= 1.1.0)
- liquid (= 4.0.3)
+ liquid (= 4.0.4)
mercenary (~> 0.3)
minima (= 2.5.1)
- nokogiri (>= 1.13.4, < 2.0)
- rouge (= 3.26.0)
+ nokogiri (>= 1.16.2, < 2.0)
+ rouge (= 3.30.0)
terminal-table (~> 1.4)
- github-pages-health-check (1.17.9)
+ webrick (~> 1.8)
+ github-pages-health-check (1.18.2)
addressable (~> 2.3)
dnsruby (~> 1.60)
- octokit (~> 4.0)
- public_suffix (>= 3.0, < 5.0)
+ octokit (>= 4, < 8)
+ public_suffix (>= 3.0, < 6.0)
typhoeus (~> 1.3)
- html-pipeline (2.14.1)
+ html-pipeline (2.14.3)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.8.0)
- i18n (0.9.5)
+ i18n (1.14.6)
concurrent-ruby (~> 1.0)
- jekyll (3.9.2)
+ jekyll (3.10.0)
addressable (~> 2.4)
colorator (~> 1.0)
+ csv (~> 3.0)
em-websocket (~> 0.5)
- i18n (~> 0.7)
+ i18n (>= 0.7, < 2)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3)
@@ -121,27 +118,28 @@ GEM
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
- jekyll-avatar (0.7.0)
+ webrick (>= 1.0)
+ jekyll-avatar (0.8.0)
jekyll (>= 3.0, < 5.0)
- jekyll-coffeescript (1.1.1)
+ jekyll-coffeescript (1.2.2)
coffee-script (~> 2.2)
- coffee-script-source (~> 1.11.1)
+ coffee-script-source (~> 1.12)
jekyll-commonmark (1.4.0)
commonmarker (~> 0.22)
- jekyll-commonmark-ghpages (0.2.0)
- commonmarker (~> 0.23.4)
- jekyll (~> 3.9.0)
+ jekyll-commonmark-ghpages (0.5.1)
+ commonmarker (>= 0.23.7, < 1.1.0)
+ jekyll (>= 3.9, < 4.0)
jekyll-commonmark (~> 1.4.0)
- rouge (>= 2.0, < 4.0)
- jekyll-default-layout (0.1.4)
- jekyll (~> 3.0)
- jekyll-feed (0.15.1)
+ rouge (>= 2.0, < 5.0)
+ jekyll-default-layout (0.1.5)
+ jekyll (>= 3.0, < 5.0)
+ jekyll-feed (0.17.0)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
- jekyll-github-metadata (2.13.0)
+ jekyll-github-metadata (2.16.1)
jekyll (>= 3.4, < 5.0)
- octokit (~> 4.0, != 4.4.0)
+ octokit (>= 4, < 7, != 4.4.0)
jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.6.0)
@@ -212,40 +210,46 @@ GEM
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
- jemoji (0.12.0)
- gemoji (~> 3.0)
+ jemoji (0.13.0)
+ gemoji (>= 3, < 5)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
- kramdown (2.3.2)
+ json (2.7.2)
+ kramdown (2.4.0)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
- liquid (4.0.3)
- listen (3.7.1)
+ liquid (4.0.4)
+ listen (3.9.0)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
+ logger (1.6.1)
mercenary (0.3.6)
+ mini_portile2 (2.8.7)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
- minitest (5.17.0)
- multipart-post (2.1.1)
- nokogiri (1.14.3-x86_64-linux)
+ minitest (5.25.1)
+ net-http (0.4.1)
+ uri
+ nokogiri (1.16.7)
+ mini_portile2 (~> 2.8.2)
racc (~> 1.4)
- octokit (4.22.0)
- faraday (>= 0.9)
- sawyer (~> 0.8.0, >= 0.5.3)
+ nokogiri (1.16.7-x86_64-linux)
+ racc (~> 1.4)
+ octokit (4.25.1)
+ faraday (>= 1, < 3)
+ sawyer (~> 0.9)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
- public_suffix (4.0.7)
- racc (1.6.2)
- rb-fsevent (0.11.1)
- rb-inotify (0.10.1)
+ public_suffix (5.1.1)
+ racc (1.8.1)
+ rb-fsevent (0.11.2)
+ rb-inotify (0.11.1)
ffi (~> 1.0)
- rexml (3.2.5)
- rouge (3.26.0)
- ruby2_keywords (0.0.5)
+ rexml (3.3.9)
+ rouge (3.30.0)
rubyzip (2.3.2)
safe_yaml (1.0.5)
sass (3.7.4)
@@ -253,32 +257,29 @@ GEM
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
- sawyer (0.8.2)
+ sawyer (0.9.2)
addressable (>= 2.3.5)
- faraday (> 0.8, < 2.0)
- simpleidn (0.2.1)
- unf (~> 0.1.4)
+ faraday (>= 0.17.3, < 3)
+ securerandom (0.3.1)
+ simpleidn (0.2.3)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
- thread_safe (0.3.6)
- typhoeus (1.4.0)
+ typhoeus (1.4.1)
ethon (>= 0.9.0)
- tzinfo (1.2.11)
- thread_safe (~> 0.1)
- unf (0.1.4)
- unf_ext
- unf_ext (0.0.8.1)
+ tzinfo (2.0.6)
+ concurrent-ruby (~> 1.0)
unicode-display_width (1.8.0)
- webrick (1.7.0)
- zeitwerk (2.6.6)
+ uri (0.13.1)
+ webrick (1.8.2)
PLATFORMS
+ ruby
x86_64-linux
x86_64-linux-musl
DEPENDENCIES
- github-pages
- webrick (~> 1.3, >= 1.3.1)
+ github-pages (>= 232)
+ webrick (~> 1.8)
BUNDLED WITH
2.3.13
diff --git a/README.md b/README.md
index 45b4bf4..7011402 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,21 @@
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
+
+
## ✨ Contents
- [Rules](#-rules)
@@ -18,6 +33,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
+- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
@@ -35,12 +51,15 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
+- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
+- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
+- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
#### Reverse proxies and load balancers
@@ -48,6 +67,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
+- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
#### Runtimes
@@ -83,7 +103,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
+- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
+- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
## 🤝 Contributing
diff --git a/_data/rules.yml b/_data/rules.yml
index 748e80b..f8651d6 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1,4 +1,3 @@
-
#
# The following yaml cannot be copy-pasted to Prometheus configuration.
# Please navigate to https://samber.github.io/awesome-prometheus-alerts/rules instead.
@@ -14,121 +13,121 @@ groups:
exporters:
- slug: embedded-exporter
rules:
- - name: Prometheus job missing
- description: A Prometheus job has disappeared
- query: 'absent(up{job="prometheus"})'
- severity: warning
- - name: Prometheus target missing
- description: A Prometheus target has disappeared. An exporter might be crashed.
- query: 'up == 0'
- severity: critical
- - name: Prometheus all targets missing
- description: A Prometheus job does not have living target anymore.
- query: 'sum by (job) (up) == 0'
- severity: critical
- - name: Prometheus target missing with warmup time
- description: Allow a job time to start up (10 minutes) before alerting that it's down.
- query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
- severity: critical
- - name: Prometheus configuration reload failure
- description: Prometheus configuration reload error
- query: 'prometheus_config_last_reload_successful != 1'
- severity: warning
- - name: Prometheus too many restarts
- description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
- query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
- severity: warning
- - name: Prometheus AlertManager job missing
- description: A Prometheus AlertManager job has disappeared
- query: 'absent(up{job="alertmanager"})'
- severity: warning
- - name: Prometheus AlertManager configuration reload failure
- description: AlertManager configuration reload error
- query: 'alertmanager_config_last_reload_successful != 1'
- severity: warning
- - name: Prometheus AlertManager config not synced
- description: Configurations of AlertManager cluster instances are out of sync
- query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
- severity: warning
- - name: Prometheus AlertManager E2E dead man switch
- description: Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
- query: 'vector(1)'
- severity: critical
- - name: Prometheus not connected to alertmanager
- description: Prometheus cannot connect the alertmanager
- query: 'prometheus_notifications_alertmanagers_discovered < 1'
- severity: critical
- - name: Prometheus rule evaluation failures
- description: 'Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.'
- query: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
- severity: critical
- - name: Prometheus template text expansion failures
- description: 'Prometheus encountered {{ $value }} template text expansion failures'
- query: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
- severity: critical
- - name: Prometheus rule evaluation slow
- description: 'Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.'
- query: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
- severity: warning
- for: 5m
- - name: Prometheus notifications backlog
- description: The Prometheus notification queue has not been empty for 10 minutes
- query: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
- severity: warning
- - name: Prometheus AlertManager notification failing
- description: Alertmanager is failing sending notifications
- query: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus target empty
- description: Prometheus has no target in service discovery
- query: 'prometheus_sd_discovered_targets == 0'
- severity: critical
- - name: Prometheus target scraping slow
- description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
- query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
- severity: warning
- for: 5m
- - name: Prometheus large scrape
- description: Prometheus has many scrapes that exceed the sample limit
- query: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
- severity: warning
- for: 5m
- - name: Prometheus target scrape duplicate
- description: Prometheus has many samples rejected due to duplicate timestamps but different values
- query: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
- severity: warning
- - name: Prometheus TSDB checkpoint creation failures
- description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
- query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB checkpoint deletion failures
- description: 'Prometheus encountered {{ $value }} checkpoint deletion failures'
- query: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB compactions failed
- description: 'Prometheus encountered {{ $value }} TSDB compactions failures'
- query: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB head truncations failed
- description: 'Prometheus encountered {{ $value }} TSDB head truncation failures'
- query: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB reload failures
- description: 'Prometheus encountered {{ $value }} TSDB reload failures'
- query: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB WAL corruptions
- description: 'Prometheus encountered {{ $value }} TSDB WAL corruptions'
- query: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
- severity: critical
- - name: Prometheus TSDB WAL truncations failed
- description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
- query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
- severity: critical
- - name: Prometheus timeseries cardinality
- description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
- query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
- severity: warning
+ - name: Prometheus job missing
+ description: A Prometheus job has disappeared
+ query: 'absent(up{job="prometheus"})'
+ severity: warning
+ - name: Prometheus target missing
+ description: A Prometheus target has disappeared. An exporter might be crashed.
+ query: "up == 0"
+ severity: critical
+ - name: Prometheus all targets missing
+ description: A Prometheus job does not have living target anymore.
+ query: "sum by (job) (up) == 0"
+ severity: critical
+ - name: Prometheus target missing with warmup time
+ description: Allow a job time to start up (10 minutes) before alerting that it's down.
+ query: "sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))"
+ severity: critical
+ - name: Prometheus configuration reload failure
+ description: Prometheus configuration reload error
+ query: "prometheus_config_last_reload_successful != 1"
+ severity: warning
+ - name: Prometheus too many restarts
+ description: Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
+ query: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+ severity: warning
+ - name: Prometheus AlertManager job missing
+ description: A Prometheus AlertManager job has disappeared
+ query: 'absent(up{job="alertmanager"})'
+ severity: warning
+ - name: Prometheus AlertManager configuration reload failure
+ description: AlertManager configuration reload error
+ query: "alertmanager_config_last_reload_successful != 1"
+ severity: warning
+ - name: Prometheus AlertManager config not synced
+ description: Configurations of AlertManager cluster instances are out of sync
+ query: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+ severity: warning
+ - name: Prometheus AlertManager E2E dead man switch
+ description: Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.
+ query: "vector(1)"
+ severity: critical
+ - name: Prometheus not connected to alertmanager
+ description: Prometheus cannot connect the alertmanager
+ query: "prometheus_notifications_alertmanagers_discovered < 1"
+ severity: critical
+ - name: Prometheus rule evaluation failures
+ description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts."
+ query: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
+ severity: critical
+ - name: Prometheus template text expansion failures
+ description: "Prometheus encountered {{ $value }} template text expansion failures"
+ query: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
+ severity: critical
+ - name: Prometheus rule evaluation slow
+ description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query."
+ query: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
+ severity: warning
+ for: 5m
+ - name: Prometheus notifications backlog
+ description: The Prometheus notification queue has not been empty for 10 minutes
+ query: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
+ severity: warning
+ - name: Prometheus AlertManager notification failing
+ description: Alertmanager is failing sending notifications
+ query: "rate(alertmanager_notifications_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus target empty
+ description: Prometheus has no target in service discovery
+ query: "prometheus_sd_discovered_targets == 0"
+ severity: critical
+ - name: Prometheus target scraping slow
+ description: Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.
+ query: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
+ severity: warning
+ for: 5m
+ - name: Prometheus large scrape
+ description: Prometheus has many scrapes that exceed the sample limit
+ query: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
+ severity: warning
+ for: 5m
+ - name: Prometheus target scrape duplicate
+ description: Prometheus has many samples rejected due to duplicate timestamps but different values
+ query: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
+ severity: warning
+ - name: Prometheus TSDB checkpoint creation failures
+ description: "Prometheus encountered {{ $value }} checkpoint creation failures"
+ query: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB checkpoint deletion failures
+ description: "Prometheus encountered {{ $value }} checkpoint deletion failures"
+ query: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB compactions failed
+ description: "Prometheus encountered {{ $value }} TSDB compactions failures"
+ query: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB head truncations failed
+ description: "Prometheus encountered {{ $value }} TSDB head truncation failures"
+ query: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB reload failures
+ description: "Prometheus encountered {{ $value }} TSDB reload failures"
+ query: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB WAL corruptions
+ description: "Prometheus encountered {{ $value }} TSDB WAL corruptions"
+ query: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus TSDB WAL truncations failed
+ description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures"
+ query: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
+ severity: critical
+ - name: Prometheus timeseries cardinality
+ description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
+ query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+ severity: warning
- name: Host and hardware
exporters:
@@ -138,53 +137,43 @@ groups:
rules:
- name: Host out of memory
description: Node memory is filling up (< 10% left)
- query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
severity: warning
for: 2m
- name: Host memory under memory pressure
- description: The node is under heavy memory pressure. High rate of major page faults
- query: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: The node is under heavy memory pressure. High rate of loading memory pages from disk.
+ query: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
severity: warning
- for: 2m
- name: Host Memory is underutilized
- description: 'Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})'
- query: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})"
+ query: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
severity: info
- for: 1w
comments: |
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- name: Host unusual network throughput in
- description: Host network interfaces are probably receiving too much data (> 100 MB/s)
- query: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: Host receive bandwidth is high (>80%).
+ query: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
severity: warning
- for: 5m
- name: Host unusual network throughput out
- description: Host network interfaces are probably sending too much data (> 100 MB/s)
- query: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: Host transmit bandwidth is high (>80%)
+ query: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
severity: warning
- for: 5m
- name: Host unusual disk read rate
- description: Disk is probably reading too much data (> 50 MB/s)
- query: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: Disk is too busy (IO wait > 80%)
+ query: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
severity: warning
- for: 5m
- - name: Host unusual disk write rate
- description: Disk is probably writing too much data (> 50 MB/s)
- query: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- severity: warning
- for: 2m
- name: Host out of disk space
description: Disk is almost full (< 10% left)
- query: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- severity: warning
+ query: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
+ severity: critical
comments: |
Please add ignored mountpoints in node_exporter parameters like
"--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
for: 2m
- - name: Host disk will fill in 24 hours
- description: Filesystem is predicted to run out of space within the next 24 hours at current write rate
- query: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - name: Host disk may fill in 24 hours
+ description: Filesystem will likely run out of space within the next 24 hours.
+ query: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
severity: warning
comments: |
Please add ignored mountpoints in node_exporter parameters like
@@ -193,146 +182,180 @@ groups:
for: 2m
- name: Host out of inodes
description: Disk is almost running out of available inodes (< 10% left)
- query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- severity: warning
+ query: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
+ severity: critical
for: 2m
- name: Host filesystem device error
- description: '{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem'
- query: 'node_filesystem_device_error == 1'
+ description: "Error stat-ing the {{ $labels.mountpoint }} filesystem"
+ query: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
severity: critical
- - name: Host inodes will fill in 24 hours
- description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
- query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ for: 2m
+ - name: Host inodes may fill in 24 hours
+ description: Filesystem will likely run out of inodes within the next 24 hours at current write rate
+ query: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
severity: warning
for: 2m
- name: Host unusual disk read latency
description: Disk latency is growing (read operations > 100ms)
- query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
severity: warning
for: 2m
- name: Host unusual disk write latency
description: Disk latency is growing (write operations > 100ms)
- query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
severity: warning
for: 2m
- name: Host high CPU load
description: CPU load is > 80%
- query: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '1 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'  # busy = 1 - idle; averaging the non-idle modes directly divides by the mode count and cannot reach 80%
severity: warning
for: 10m
- name: Host CPU is underutilized
- description: 'CPU load is < 20% for 1 week. Consider reducing the number of CPUs.'
- query: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs."
+ query: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
severity: info
for: 1w
comments: |
You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- name: Host CPU steal noisy neighbor
description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
- query: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
severity: warning
- name: Host CPU high iowait
- description: CPU iowait > 10%. A high iowait means that you are disk or network bound.
- query: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: CPU iowait > 10%. Your CPU is idling waiting for storage to respond.
+ query: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
severity: warning
- name: Host unusual disk IO
- description: 'Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.'
- query: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "Disk I/O time > 80%. Check storage for issues or increase IOPS capabilities."
+ query: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
severity: warning
for: 5m
- - name: Host context switching
- description: Context switching is growing on the node (> 10000 / s)
- query: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - name: Host context switching high
+ description: Context switching is growing on the node (twice the daily average during the last 15m)
+ query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
severity: warning
comments: |
- 10000 context switches is an arbitrary number.
+ x2 context switches is an arbitrary number.
The alert threshold depends on the nature of the application.
Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- name: Host swap is filling up
description: Swap is filling up (>80%)
- query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
severity: warning
for: 2m
- name: Host systemd service crashed
description: "systemd service crashed"
- query: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(node_systemd_unit_state{state="failed"} == 1)'
severity: warning
- name: Host physical component too hot
description: "Physical hardware component too hot"
- query: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
severity: warning
for: 5m
- name: Host node overtemperature alarm
description: "Physical node temperature alarm triggered"
- query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
severity: critical
- - name: Host RAID array got inactive
- description: 'RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.'
- query: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - name: Host software RAID insufficient drives
+ description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
+ query: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
severity: critical
- - name: Host RAID disk failure
- description: 'At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap'
- query: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - name: Host software RAID disk failure
+ description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention."
+ query: '(node_md_disks{state="failed"} > 0)'
severity: warning
for: 2m
- name: Host kernel version deviations
- description: Different kernel versions are running
- query: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- severity: warning
- for: 6h
+ description: Kernel version for {{ $labels.instance }} has changed.
+ query: 'changes(node_uname_info[1h]) > 0'
+ severity: info
- name: Host OOM kill detected
description: OOM kill detected
- query: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(increase(node_vmstat_oom_kill[1m]) > 0)'
severity: warning
- name: Host EDAC Correctable Errors detected
description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'
- query: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
severity: info
- name: Host EDAC Uncorrectable Errors detected
description: 'Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'
- query: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(node_edac_uncorrectable_errors_total > 0)'
severity: warning
- name: Host Network Receive Errors
description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.'
- query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
severity: warning
for: 2m
- name: Host Network Transmit Errors
description: 'Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
- query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
severity: warning
for: 2m
- - name: Host Network Interface Saturated
- description: 'The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.'
- query: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' # < to 10Gb to prevent +inf when max speed is unknown
- severity: warning
- for: 1m
- name: Host Network Bond Degraded
description: 'Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".'
- query: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ query: '((node_bonding_active - node_bonding_slaves) != 0)'
severity: warning
for: 2m
- name: Host conntrack limit
- description: 'The number of conntrack is approaching limit'
- query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "The number of conntrack is approaching limit"
+ query: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
severity: warning
for: 5m
- name: Host clock skew
- description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
- query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host."
+ query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
severity: warning
for: 10m
- name: Host clock not synchronising
- description: 'Clock not synchronising. Ensure NTP is configured on this host.'
- query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "Clock not synchronising. Ensure NTP is configured on this host."
+ query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
severity: warning
for: 2m
- name: Host requires reboot
- description: '{{ $labels.instance }} requires a reboot.'
- query: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ description: "{{ $labels.instance }} requires a reboot."
+ query: '(node_reboot_required > 0)'
severity: info
for: 4h
+ - name: S.M.A.R.T Device Monitoring
+ exporters:
+ - name: smartctl-exporter
+ slug: smartctl-exporter
+ doc_url: https://github.com/prometheus-community/smartctl_exporter
+ rules:
+ - name: SMART device temperature warning
+ description: Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C
+ query: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
+ severity: warning
+ - name: SMART device temperature critical
+ description: Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C
+ query: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
+ severity: critical
+ - name: SMART device temperature over trip value
+ description: Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
+ severity: critical
+ - name: SMART device temperature nearing trip value
+ description: Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
+ severity: warning
+ - name: SMART status
+ description: Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'smartctl_device_smart_status != 1'
+ severity: critical
+ - name: SMART critical warning
+ description: Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'smartctl_device_critical_warning > 0'
+ severity: critical
+ - name: SMART media errors
+ description: Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'smartctl_device_media_errors > 0'
+ severity: critical
+ - name: SMART Wearout Indicator
+ description: Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}
+ query: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
+ severity: critical
+
- name: Docker containers
exporters:
- name: google/cAdvisor
@@ -341,20 +364,20 @@ groups:
rules:
- name: Container killed
description: A container has disappeared
- query: 'time() - container_last_seen > 60'
+ query: "time() - container_last_seen > 60"
severity: warning
comments: |
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- name: Container absent
description: A container is absent for 5 min
- query: 'absent(container_last_seen)'
+ query: "absent(container_last_seen)"
severity: warning
for: 5m
comments: |
This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- name: Container High CPU utilization
description: Container CPU utilization is above 80%
- query: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
+ query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
severity: warning
for: 2m
- name: Container High Memory usage
@@ -370,16 +393,16 @@ groups:
for: 2m
- name: Container high throttle rate
description: Container is being throttled
- query: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
+ query: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
severity: warning
- for: 2m
+ for: 5m
- name: Container high low change CPU usage
description: This alert rule monitors the absolute change in CPU usage within a time window and triggers an alert when the change exceeds 25%.
query: '(abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m] offset 1m)) * 100)) or abs((sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[1m])) * 100) - (sum by (instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[5m] offset 1m)) * 100))) > 25'
severity: info
- name: Container Low CPU utilization
description: Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.
- query: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
+ query: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
severity: info
for: 7d
- name: Container Low Memory usage
@@ -388,7 +411,6 @@ groups:
severity: info
for: 7d
-
- name: Blackbox
exporters:
- name: prometheus/blackbox_exporter
@@ -401,28 +423,28 @@ groups:
severity: critical
- name: Blackbox configuration reload failure
description: Blackbox configuration reload failure
- query: 'blackbox_exporter_config_last_reload_successful != 1'
+ query: "blackbox_exporter_config_last_reload_successful != 1"
severity: warning
- name: Blackbox slow probe
description: Blackbox probe took more than 1s to complete
- query: 'avg_over_time(probe_duration_seconds[1m]) > 1'
+ query: "avg_over_time(probe_duration_seconds[1m]) > 1"
severity: warning
for: 1m
- name: Blackbox probe HTTP failure
description: HTTP status code is not 200-399
- query: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
+ query: "probe_http_status_code <= 199 OR probe_http_status_code >= 400"
severity: critical
- name: Blackbox SSL certificate will expire soon
description: SSL certificate expires in less than 20 days
- query: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
+ query: "3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20"
severity: warning
- name: Blackbox SSL certificate will expire soon
description: SSL certificate expires in less than 3 days
- query: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
+ query: "0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3"
severity: critical
- name: Blackbox SSL certificate expired
description: SSL certificate has expired already
- query: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
+ query: "round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0"
severity: critical
comments: |
For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
@@ -431,12 +453,12 @@ groups:
See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- name: Blackbox probe slow HTTP
description: HTTP request took more than 1s
- query: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
+ query: "avg_over_time(probe_http_duration_seconds[1m]) > 1"
severity: warning
for: 1m
- name: Blackbox probe slow ping
description: Blackbox ping took more than 1s
- query: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
+ query: "avg_over_time(probe_icmp_duration_seconds[1m]) > 1"
severity: warning
for: 1m
@@ -448,7 +470,7 @@ groups:
rules:
- name: Windows Server collector Error
description: "Collector {{ $labels.collector }} was not successful"
- query: 'windows_exporter_collector_success == 0'
+ query: "windows_exporter_collector_success == 0"
severity: critical
- name: Windows Server service Status
description: Windows Service state is not OK
@@ -461,12 +483,12 @@ groups:
severity: warning
- name: Windows Server memory Usage
description: Memory usage is more than 90%
- query: '100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90'
+ query: "100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90"
severity: warning
for: 2m
- name: Windows Server disk Space Usage
description: Disk usage is more than 80%
- query: '100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80'
+ query: "100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80"
severity: critical
for: 2m
@@ -478,22 +500,22 @@ groups:
rules:
- name: Virtual Machine Memory Warning
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: 'vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90'
+ query: "vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90"
severity: warning
for: 5m
- name: Virtual Machine Memory Critical
description: 'High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: 'vmware_vm_mem_usage_average / 100 >= 90'
+ query: "vmware_vm_mem_usage_average / 100 >= 90"
severity: critical
for: 1m
- name: High Number of Snapshots
description: "High snapshots number on {{ $labels.instance }}: {{ $value }}"
- query: 'vmware_vm_snapshots > 3'
+ query: "vmware_vm_snapshots > 3"
severity: warning
for: 30m
- name: Outdated Snapshots
description: 'Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days'
- query: '(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3'
+ query: "(time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3"
severity: warning
for: 5m
@@ -515,7 +537,7 @@ groups:
for: 5m
- name: Netdata high memory usage
description: Netdata high memory usage (> 80%)
- query: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
+ query: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
severity: warning
for: 5m
- name: Netdata low disk space
@@ -529,23 +551,22 @@ groups:
severity: warning
- name: Netdata MD mismatch cnt unsynchronized blocks
description: RAID Array have unsynchronized blocks
- query: 'netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024'
+ query: "netdata_md_mismatch_cnt_unsynchronized_blocks_average > 1024"
severity: warning
for: 2m
- name: Netdata disk reallocated sectors
description: Reallocated sectors on disk
- query: 'increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0'
+ query: "increase(netdata_smartd_log_reallocated_sectors_count_sectors_average[1m]) > 0"
severity: info
- name: Netdata disk current pending sector
description: Disk current pending sector
- query: 'netdata_smartd_log_current_pending_sector_count_sectors_average > 0'
+ query: "netdata_smartd_log_current_pending_sector_count_sectors_average > 0"
severity: warning
- name: Netdata reported uncorrectable disk sectors
description: Reported uncorrectable disk sectors
- query: 'increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0'
+ query: "increase(netdata_smartd_log_offline_uncorrectable_sector_count_sectors_average[2m]) > 0"
severity: warning
-
- name: Databases and brokers
services:
- name: MySQL
@@ -556,29 +577,34 @@ groups:
rules:
- name: MySQL down
description: MySQL instance is down on {{ $labels.instance }}
- query: 'mysql_up == 0'
+ query: "mysql_up == 0"
severity: critical
- name: MySQL too many connections (> 80%)
- description: 'More than 80% of MySQL connections are in use on {{ $labels.instance }}'
- query: 'max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80'
+ description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}"
+ query: "max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80"
+ severity: warning
+ for: 2m
+ - name: MySQL high prepared statements utilization (> 80%)
+ description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}"
+ query: "max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80"
severity: warning
for: 2m
- name: MySQL high threads running
- description: 'More than 60% of MySQL connections are in running state on {{ $labels.instance }}'
- query: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
+ description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}"
+ query: "max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60"
severity: warning
for: 2m
- name: MySQL Slave IO thread not running
- description: 'MySQL Slave IO thread not running on {{ $labels.instance }}'
- query: '( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0'
+ description: "MySQL Slave IO thread not running on {{ $labels.instance }}"
+ query: "( mysql_slave_status_slave_io_running and ON (instance) mysql_slave_status_master_server_id > 0 ) == 0"
severity: critical
- name: MySQL Slave SQL thread not running
- description: 'MySQL Slave SQL thread not running on {{ $labels.instance }}'
- query: '( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0'
+ description: "MySQL Slave SQL thread not running on {{ $labels.instance }}"
+ query: "( mysql_slave_status_slave_sql_running and ON (instance) mysql_slave_status_master_server_id > 0) == 0"
severity: critical
- name: MySQL Slave replication lag
- description: 'MySQL replication lag on {{ $labels.instance }}'
- query: '( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30'
+ description: "MySQL replication lag on {{ $labels.instance }}"
+ query: "( (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and ON (instance) mysql_slave_status_master_server_id > 0 ) > 30"
severity: critical
for: 1m
- name: MySQL slow queries
@@ -592,8 +618,28 @@ groups:
severity: warning
- name: MySQL restarted
description: MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
- query: 'mysql_global_status_uptime < 60'
+ query: "mysql_global_status_uptime < 60"
severity: info
+ - name: MySQL High QPS
+ description: MySQL is being overloaded with unusual QPS (> 10k QPS).
+ query: "irate(mysql_global_status_questions[1m]) > 10000"
+ severity: info
+ for: 2m
+ - name: MySQL too many open files
+ description: MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.
+ query: "mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75"
+ severity: warning
+ for: 2m
+ - name: MySQL InnoDB Force Recovery is enabled
+ description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}"
+ query: "mysql_global_variables_innodb_force_recovery != 0"
+ severity: warning
+ for: 2m
+ - name: MySQL InnoDB history_len too long
+ description: "MySQL history_len (undo log) too long on {{ $labels.instance }}"
+ query: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
+ severity: warning
+ for: 2m
- name: PostgreSQL
exporters:
@@ -603,33 +649,33 @@ groups:
rules:
- name: Postgresql down
description: Postgresql instance is down
- query: 'pg_up == 0'
+ query: "pg_up == 0"
severity: critical
- name: Postgresql restarted
description: Postgresql restarted
- query: 'time() - pg_postmaster_start_time_seconds < 60'
+ query: "time() - pg_postmaster_start_time_seconds < 60"
severity: critical
- name: Postgresql exporter error
description: Postgresql exporter is showing errors. A query may be buggy in query.yaml
- query: 'pg_exporter_last_scrape_error > 0'
+ query: "pg_exporter_last_scrape_error > 0"
severity: critical
- name: Postgresql table not auto vacuumed
description: Table {{ $labels.relname }} has not been auto vacuumed for 10 days
- query: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
+ query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10"
severity: warning
- name: Postgresql table not auto analyzed
description: Table {{ $labels.relname }} has not been auto analyzed for 10 days
- query: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
+ query: "((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10"
severity: warning
- name: Postgresql too many connections
description: PostgreSQL instance has too many connections (> 80%).
- expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
+ query: "sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)"
severity: warning
for: 2m
- name: Postgresql not enough connections
description: PostgreSQL instance should have more connections (> 5)
query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
- severity: warning
+ severity: critical
for: 2m
- name: Postgresql dead locks
description: PostgreSQL has dead-locks
@@ -641,12 +687,12 @@ groups:
severity: warning
- name: Postgresql commit rate low
description: Postgresql seems to be processing very few transactions
- query: 'rate(pg_stat_database_xact_commit[1m]) < 10'
+ query: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
severity: critical
for: 2m
- name: Postgresql low XID consumption
description: Postgresql seems to be consuming transaction IDs very slowly
- query: 'rate(pg_txid_current[1m]) < 5'
+ query: "rate(pg_txid_current[1m]) < 5"
severity: warning
for: 2m
- name: Postgresql high rate statement timeout
@@ -659,41 +705,48 @@ groups:
severity: critical
- name: Postgresql unused replication slot
description: Unused Replication Slots
- query: 'pg_replication_slots_active == 0'
+ query: "pg_replication_slots_active == 0"
severity: warning
for: 1m
- name: Postgresql too many dead tuples
description: PostgreSQL dead tuples is too large
- query: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
+ query: "((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1"
severity: warning
for: 2m
- name: Postgresql configuration changed
description: Postgres Database configuration change has occurred
- query: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+ query: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
severity: info
- name: Postgresql SSL compression active
- description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
- query: 'sum(pg_stat_ssl_compression) > 0'
+ description: Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.
+ query: "sum(pg_stat_ssl_compression) > 0"
severity: critical
- name: Postgresql too many locks acquired
description: Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
- query: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
+ query: "((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20"
severity: critical
for: 2m
- name: Postgresql bloat index high (> 80%)
- description: 'The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`'
- query: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
+ description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`"
+ query: "pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)"
severity: warning
for: 1h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql bloat table high (> 80%)
- description: 'The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`'
- query: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
+ description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`"
+ query: "pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)"
severity: warning
for: 1h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
+ - name: Postgresql invalid index
+ description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
+ query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+ severity: warning
+ for: 6h
+ comments: |
+ See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: SQL Server
exporters:
@@ -729,7 +782,7 @@ groups:
rules:
- name: PGBouncer active connections
description: PGBouncer pools are filling up
- query: 'pgbouncer_pools_server_active_connections > 200'
+ query: "pgbouncer_pools_server_active_connections > 200"
severity: warning
for: 2m
- name: PGBouncer errors
@@ -749,7 +802,7 @@ groups:
rules:
- name: Redis down
description: Redis instance is down
- query: 'redis_up == 0'
+ query: "redis_up == 0"
severity: critical
- name: Redis missing master
description: Redis cluster has no node marked as master.
@@ -761,46 +814,46 @@ groups:
severity: critical
- name: Redis disconnected slaves
description: Redis not replicating for all slaves. Consider reviewing the redis replication status.
- query: 'count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0'
+ query: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 0"
severity: critical
- name: Redis replication broken
description: Redis instance lost a slave
- query: 'delta(redis_connected_slaves[1m]) < 0'
+ query: "delta(redis_connected_slaves[1m]) < 0"
severity: critical
- name: Redis cluster flapping
description: Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping).
- query: 'changes(redis_connected_slaves[1m]) > 1'
+ query: "changes(redis_connected_slaves[1m]) > 1"
severity: critical
for: 2m
- name: Redis missing backup
description: Redis has not been backuped for 24 hours
- query: 'time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24'
+ query: "time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24"
severity: critical
- name: Redis out of system memory
description: Redis is running out of system memory (> 90%)
- query: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90'
+ query: "redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90"
severity: warning
for: 2m
comments: |
The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- name: Redis out of configured maxmemory
description: Redis is running out of configured maxmemory (> 90%)
- query: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
+ query: "redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0"
severity: warning
for: 2m
- name: Redis too many connections
description: Redis is running out of connections (> 90% used)
- query: 'redis_connected_clients / redis_config_maxclients * 100 > 90'
+ query: "redis_connected_clients / redis_config_maxclients * 100 > 90"
severity: warning
for: 2m
- name: Redis not enough connections
description: Redis instance should have more connections (> 5)
- query: 'redis_connected_clients < 5'
+ query: "redis_connected_clients < 5"
severity: warning
for: 2m
- name: Redis rejected connections
description: Some connections to Redis has been rejected
- query: 'increase(redis_rejected_connections_total[1m]) > 0'
+ query: "increase(redis_rejected_connections_total[1m]) > 0"
severity: critical
- name: MongoDB
@@ -811,11 +864,11 @@ groups:
rules:
- name: MongoDB Down
description: MongoDB instance is down
- query: 'mongodb_up == 0'
+ query: "mongodb_up == 0"
severity: critical
- name: Mongodb replica member unhealthy
description: MongoDB replica member is not healthy
- query: 'mongodb_rs_members_health == 0'
+ query: "mongodb_rs_members_health == 0"
severity: critical
- name: MongoDB replication lag
description: Mongodb replication lag is more than 10s
@@ -832,7 +885,7 @@ groups:
for: 2m
- name: MongoDB cursors timeouts
description: Too many cursors are timing out
- query: 'increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100'
+ query: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
severity: warning
for: 2m
- name: MongoDB too many connections
@@ -840,11 +893,6 @@ groups:
query: 'avg by(instance) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(instance) (sum (mongodb_ss_connections) by (instance)) * 100 > 80'
severity: warning
for: 2m
- - name: MongoDB virtual memory usage
- description: High memory usage
- query: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
- severity: warning
- for: 2m
- name: dcu/mongodb_exporter
slug: dcu-mongodb-exporter
@@ -856,23 +904,23 @@ groups:
severity: critical
- name: MongoDB replication Status 3
description: MongoDB Replication set member either perform startup self-checks, or transition from completing a rollback or resync
- query: 'mongodb_replset_member_state == 3'
+ query: "mongodb_replset_member_state == 3"
severity: critical
- name: MongoDB replication Status 6
description: MongoDB Replication set member as seen from another member of the set, is not yet known
- query: 'mongodb_replset_member_state == 6'
+ query: "mongodb_replset_member_state == 6"
severity: critical
- name: MongoDB replication Status 8
description: MongoDB Replication set member as seen from another member of the set, is unreachable
- query: 'mongodb_replset_member_state == 8'
+ query: "mongodb_replset_member_state == 8"
severity: critical
- name: MongoDB replication Status 9
description: MongoDB Replication set member is actively performing a rollback. Data is not available for reads
- query: 'mongodb_replset_member_state == 9'
+ query: "mongodb_replset_member_state == 9"
severity: critical
- name: MongoDB replication Status 10
description: MongoDB Replication set member was once in a replica set but was subsequently removed
- query: 'mongodb_replset_member_state == 10'
+ query: "mongodb_replset_member_state == 10"
severity: critical
- name: MongoDB number cursors open
description: Too many cursors opened by MongoDB for clients (> 10k)
@@ -881,7 +929,7 @@ groups:
for: 2m
- name: MongoDB cursors timeouts
description: Too many cursors are timing out
- query: 'increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100'
+ query: "increase(mongodb_metrics_cursor_timed_out_total[1m]) > 100"
severity: warning
for: 2m
- name: MongoDB too many connections
@@ -910,47 +958,52 @@ groups:
slug: rabbitmq-exporter
doc_url: https://github.com/rabbitmq/rabbitmq-prometheus
rules:
- - name: Rabbitmq node down
+ - name: RabbitMQ node down
description: Less than 3 nodes running in RabbitMQ cluster
- query: 'sum(rabbitmq_build_info) < 3'
+ query: "sum(rabbitmq_build_info) < 3"
severity: critical
- - name: Rabbitmq node not distributed
+ - name: RabbitMQ node not distributed
description: Distribution link state is not 'up'
- query: 'erlang_vm_dist_node_state < 3'
+ query: "erlang_vm_dist_node_state < 3"
severity: critical
- - name: Rabbitmq instances different versions
- description: Running different version of Rabbitmq in the same cluster, can lead to failure.
- query: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
+ - name: RabbitMQ instances different versions
+ description: Running different versions of RabbitMQ in the same cluster can lead to failure.
+ query: "count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1"
severity: warning
for: 1h
- - name: Rabbitmq memory high
+ - name: RabbitMQ memory high
description: A node use more than 90% of allocated RAM
- query: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
+ query: "rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90"
severity: warning
for: 2m
- - name: Rabbitmq file descriptors usage
+ - name: RabbitMQ file descriptors usage
description: A node use more than 90% of file descriptors
- query: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
+ query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90"
severity: warning
for: 2m
- - name: Rabbitmq too many unack messages
- description: Too many unacknowledged messages
- query: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
+ - name: RabbitMQ too many ready messages
+ description: RabbitMQ has too many ready messages in queue {{ $labels.queue }}
+ query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000"
severity: warning
for: 1m
- - name: Rabbitmq too many connections
+ - name: RabbitMQ too many unack messages
+ description: Too many unacknowledged messages
+ query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000"
+ severity: warning
+ for: 1m
+ - name: RabbitMQ too many connections
description: The total connections of a node is too high
- query: 'rabbitmq_connections > 1000'
+ query: "rabbitmq_connections > 1000"
severity: warning
for: 2m
- - name: Rabbitmq no queue consumer
+ - name: RabbitMQ no queue consumer
description: A queue has less than 1 consumer
- query: 'rabbitmq_queue_consumers < 1'
+ query: "rabbitmq_queue_consumers < 1"
severity: warning
- for: 1m # allows a short service restart
- - name: Rabbitmq unroutable messages
+ for: 1m # allows a short service restart
+ - name: RabbitMQ unroutable messages
description: A queue has unroutable messages
- query: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
+ query: "increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0"
severity: warning
for: 2m
@@ -958,61 +1011,61 @@ groups:
slug: kbudde-rabbitmq-exporter
doc_url: https://github.com/kbudde/rabbitmq_exporter
rules:
- - name: Rabbitmq down
+ - name: RabbitMQ down
description: RabbitMQ node down
- query: 'rabbitmq_up == 0'
+ query: "rabbitmq_up == 0"
severity: critical
- - name: Rabbitmq cluster down
+ - name: RabbitMQ cluster down
description: Less than 3 nodes running in RabbitMQ cluster
- query: 'sum(rabbitmq_running) < 3'
+ query: "sum(rabbitmq_running) < 3"
severity: critical
- - name: Rabbitmq cluster partition
+ - name: RabbitMQ cluster partition
description: Cluster partition
- query: 'rabbitmq_partitions > 0'
+ query: "rabbitmq_partitions > 0"
severity: critical
- - name: Rabbitmq out of memory
+ - name: RabbitMQ out of memory
description: Memory available for RabbmitMQ is low (< 10%)
- query: 'rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90'
+ query: "rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90"
severity: warning
for: 2m
- - name: Rabbitmq too many connections
+ - name: RabbitMQ too many connections
description: RabbitMQ instance has too many connections (> 1000)
- query: 'rabbitmq_connectionsTotal > 1000'
+ query: "rabbitmq_connectionsTotal > 1000"
severity: warning
for: 2m
- - name: Rabbitmq dead letter queue filling up
+ - name: RabbitMQ dead letter queue filling up
description: Dead letter queue is filling up (> 10 msgs)
query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
severity: warning
for: 1m
comments: |
Indicate the queue name in dedicated label.
- - name: Rabbitmq too many messages in queue
+ - name: RabbitMQ too many messages in queue
description: Queue is filling up (> 1000 msgs)
query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
severity: warning
for: 2m
comments: |
Indicate the queue name in dedicated label.
- - name: Rabbitmq slow queue consuming
+ - name: RabbitMQ slow queue consuming
description: Queue messages are consumed slowly (> 60s)
query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
severity: warning
for: 2m
comments: |
Indicate the queue name in dedicated label.
- - name: Rabbitmq no consumer
+ - name: RabbitMQ no consumer
description: Queue has no consumer
- query: 'rabbitmq_queue_consumers == 0'
+ query: "rabbitmq_queue_consumers == 0"
severity: critical
- for: 1m # allows a short service restart
- - name: Rabbitmq too many consumers
+ for: 1m # allows a short service restart
+ - name: RabbitMQ too many consumers
description: Queue should have only 1 consumer
query: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
severity: critical
comments: |
Indicate the queue name in dedicated label.
- - name: Rabbitmq unactive exchange
+ - name: RabbitMQ unactive exchange
description: Exchange receive less than 5 msgs per second
query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
severity: warning
@@ -1038,11 +1091,11 @@ groups:
for: 2m
- name: Elasticsearch disk out of space
description: The disk usage is over 90%
- query: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10'
+ query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10"
severity: critical
- name: Elasticsearch disk space low
description: The disk usage is over 80%
- query: 'elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20'
+ query: "elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20"
severity: warning
for: 2m
- name: Elasticsearch Cluster Red
@@ -1055,43 +1108,78 @@ groups:
severity: warning
- name: Elasticsearch Healthy Nodes
description: "Missing node in Elasticsearch cluster"
- query: 'elasticsearch_cluster_health_number_of_nodes < 3'
+ query: "elasticsearch_cluster_health_number_of_nodes < 3"
severity: critical
- name: Elasticsearch Healthy Data Nodes
description: "Missing data node in Elasticsearch cluster"
- query: 'elasticsearch_cluster_health_number_of_data_nodes < 3'
+ query: "elasticsearch_cluster_health_number_of_data_nodes < 3"
severity: critical
- name: Elasticsearch relocating shards
description: "Elasticsearch is relocating shards"
- query: 'elasticsearch_cluster_health_relocating_shards > 0'
+ query: "elasticsearch_cluster_health_relocating_shards > 0"
severity: info
- name: Elasticsearch relocating shards too long
description: "Elasticsearch has been relocating shards for 15min"
- query: 'elasticsearch_cluster_health_relocating_shards > 0'
+ query: "elasticsearch_cluster_health_relocating_shards > 0"
severity: warning
for: 15m
- name: Elasticsearch initializing shards
description: "Elasticsearch is initializing shards"
- query: 'elasticsearch_cluster_health_initializing_shards > 0'
+ query: "elasticsearch_cluster_health_initializing_shards > 0"
severity: info
- name: Elasticsearch initializing shards too long
description: "Elasticsearch has been initializing shards for 15 min"
- query: 'elasticsearch_cluster_health_initializing_shards > 0'
+ query: "elasticsearch_cluster_health_initializing_shards > 0"
severity: warning
for: 15m
- name: Elasticsearch unassigned shards
- description: 'Elasticsearch has unassigned shards'
- query: 'elasticsearch_cluster_health_unassigned_shards > 0'
+ description: "Elasticsearch has unassigned shards"
+ query: "elasticsearch_cluster_health_unassigned_shards > 0"
severity: critical
- name: Elasticsearch pending tasks
- description: 'Elasticsearch has pending tasks. Cluster works slowly.'
- query: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'
+ description: "Elasticsearch has pending tasks. Cluster works slowly."
+ query: "elasticsearch_cluster_health_number_of_pending_tasks > 0"
severity: warning
for: 15m
- name: Elasticsearch no new documents
- description: No new documents for 10 min!
+ description: "No new documents for 10 min!"
query: 'increase(elasticsearch_indices_indexing_index_total{es_data_node="true"}[10m]) < 1'
severity: warning
+ - name: Elasticsearch High Indexing Latency
+ description: "The indexing latency on Elasticsearch cluster is higher than the threshold."
+ query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005"
+ severity: warning
+ for: 10m
+ - name: Elasticsearch High Indexing Rate
+ description: "The indexing rate on Elasticsearch cluster is higher than the threshold."
+ query: "sum(rate(elasticsearch_indices_indexing_index_total[1m])) > 10000"
+ severity: warning
+ for: 5m
+ - name: Elasticsearch High Query Rate
+ description: "The query rate on Elasticsearch cluster is higher than the threshold."
+ query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100"
+ severity: warning
+ for: 5m
+ - name: Elasticsearch High Query Latency
+ description: "The query latency on Elasticsearch cluster is higher than the threshold."
+ query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1"
+ severity: warning
+ for: 5m
+
+ - name: Meilisearch
+ exporters:
+ - name: Embedded exporter
+ slug: embedded-exporter
+ doc_url: https://github.com/orgs/meilisearch/discussions/625
+ rules:
+ - name: Meilisearch index is empty
+ description: Meilisearch index is empty (it contains no documents)
+ query: 'meilisearch_index_docs_count == 0'
+ severity: warning
+ - name: Meilisearch http response time
+ description: Meilisearch http response time is too high
+ query: "meilisearch_http_response_time_seconds > 0.5"
+ severity: warning
- name: Cassandra
exporters:
@@ -1099,60 +1187,60 @@ groups:
slug: instaclustr-cassandra-exporter
doc_url: https://github.com/instaclustr/cassandra-exporter
rules:
- - name: 'Cassandra Node is unavailable'
- description: 'Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}'
- query: 'sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1'
+ - name: "Cassandra Node is unavailable"
+ description: "Cassandra Node is unavailable - {{ $labels.cassandra_cluster }} {{ $labels.exported_endpoint }}"
+ query: "sum(cassandra_endpoint_active) by (cassandra_cluster,instance,exported_endpoint) < 1"
severity: critical
- - name: 'Cassandra many compaction tasks are pending'
- description: 'Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}'
- query: 'cassandra_table_estimated_pending_compactions > 100'
+ - name: "Cassandra many compaction tasks are pending"
+ description: "Many Cassandra compaction tasks are pending - {{ $labels.cassandra_cluster }}"
+ query: "cassandra_table_estimated_pending_compactions > 100"
severity: warning
- - name: 'Cassandra commitlog pending tasks'
- description: 'Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}'
- query: 'cassandra_commit_log_pending_tasks > 15'
+ - name: "Cassandra commitlog pending tasks"
+ description: "Cassandra commitlog pending tasks - {{ $labels.cassandra_cluster }}"
+ query: "cassandra_commit_log_pending_tasks > 15"
for: 2m
severity: warning
- - name: 'Cassandra compaction executor blocked tasks'
- description: 'Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra compaction executor blocked tasks"
+ description: "Some Cassandra compaction executor tasks are blocked - {{ $labels.cassandra_cluster }}"
query: 'cassandra_thread_pool_blocked_tasks{pool="CompactionExecutor"} > 15'
for: 2m
severity: warning
- - name: 'Cassandra flush writer blocked tasks'
- description: 'Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra flush writer blocked tasks"
+ description: "Some Cassandra flush writer tasks are blocked - {{ $labels.cassandra_cluster }}"
query: 'cassandra_thread_pool_blocked_tasks{pool="MemtableFlushWriter"} > 15'
for: 2m
severity: warning
- - name: 'Cassandra connection timeouts total'
- description: 'Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}'
- query: 'avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5'
+ - name: "Cassandra connection timeouts total"
+ description: "Some connection between nodes are ending in timeout - {{ $labels.cassandra_cluster }}"
+ query: "avg(cassandra_client_request_timeouts_total) by (cassandra_cluster,instance) > 5"
for: 2m
severity: critical
- - name: 'Cassandra storage exceptions'
- description: 'Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}'
- query: 'changes(cassandra_storage_exceptions_total[1m]) > 1'
+ - name: "Cassandra storage exceptions"
+ description: "Something is going wrong with cassandra storage - {{ $labels.cassandra_cluster }}"
+ query: "changes(cassandra_storage_exceptions_total[1m]) > 1"
severity: critical
- - name: 'Cassandra tombstone dump'
- description: 'Cassandra tombstone dump - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra tombstone dump"
+ description: "Cassandra tombstone dump - {{ $labels.cassandra_cluster }}"
query: 'avg(cassandra_table_tombstones_scanned{quantile="0.99"}) by (instance,cassandra_cluster,keyspace) > 100'
for: 2m
severity: critical
- - name: 'Cassandra client request unavailable write'
- description: 'Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra client request unavailable write"
+ description: "Some Cassandra client requests are unavailable to write - {{ $labels.cassandra_cluster }}"
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="write"}[1m]) > 0'
for: 2m
severity: critical
- - name: 'Cassandra client request unavailable read'
- description: 'Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra client request unavailable read"
+ description: "Some Cassandra client requests are unavailable to read - {{ $labels.cassandra_cluster }}"
query: 'changes(cassandra_client_request_unavailable_exceptions_total{operation="read"}[1m]) > 0'
for: 2m
severity: critical
- - name: 'Cassandra client request write failure'
- description: 'Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra client request write failure"
+ description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
query: 'increase(cassandra_client_request_failures_total{operation="write"}[1m]) > 0'
for: 2m
severity: critical
- - name: 'Cassandra client request read failure'
- description: 'Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}'
+ - name: "Cassandra client request read failure"
+ description: "Read failures have occurred, ensure there are not too many unavailable nodes - {{ $labels.cassandra_cluster }}"
query: 'increase(cassandra_client_request_failures_total{operation="read"}[1m]) > 0'
for: 2m
severity: critical
@@ -1244,31 +1332,113 @@ groups:
severity: critical
for: 2m
+ - name: ClickHouse
+ exporters:
+ - name: Embedded Exporter
+ slug: embedded-exporter
+ doc_url: https://clickhouse.com/docs/en/operations/system-tables/metrics
+ rules:
+ - name: ClickHouse Memory Usage Critical
+ description: Memory usage is critically high, over 90%.
+ query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90"
+ severity: critical
+ for: 5m
+ - name: ClickHouse Memory Usage Warning
+ description: Memory usage is over 80%.
+ query: "ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80"
+ severity: warning
+ for: 5m
+ - name: ClickHouse Disk Space Low on Default
+ description: Disk space on default is below 20%.
+ query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20"
+ severity: warning
+ for: 2m
+ - name: ClickHouse Disk Space Critical on Default
+ description: Disk space on default disk is critically low, below 10%.
+ query: "ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10"
+ severity: critical
+ for: 2m
+ - name: ClickHouse Disk Space Low on Backups
+ description: Disk space on backups is below 20%.
+ query: "ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20"
+ severity: warning
+ for: 2m
+ - name: ClickHouse Replica Errors
+ description: Critical replica errors detected, either all replicas are stale or lost.
+ query: "ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1"
+ severity: critical
+ for: 0m
+ - name: ClickHouse No Available Replicas
+ description: No available replicas in ClickHouse.
+ query: "ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1"
+ severity: critical
+ for: 0m
+ - name: ClickHouse No Live Replicas
+ description: There are too few live replicas available, risking data loss and service disruption.
+ query: "ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1"
+ severity: critical
+ for: 0m
+ - name: ClickHouse High Network Traffic
+ description: Network traffic is unusually high, may affect cluster performance.
+ query: "ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250"
+ severity: warning
+ for: 5m
+ comments: |
+ Please replace the threshold with an appropriate value
+ - name: ClickHouse High TCP Connections
+ description: High number of TCP connections, indicating heavy client or inter-cluster communication.
+ query: "ClickHouseMetrics_TCPConnection > 400"
+ severity: warning
+ for: 5m
+ comments: |
+ Please replace the threshold with an appropriate value
+ - name: ClickHouse Interserver Connection Issues
+ description: An increase in interserver connections may indicate replication or distributed query handling issues.
+ query: "increase(ClickHouseMetrics_InterserverConnection[5m]) > 0"
+ severity: warning
+ for: 1m
+ - name: ClickHouse ZooKeeper Connection Issues
+ description: ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.
+ query: "avg(ClickHouseMetrics_ZooKeeperSession) != 1"
+ severity: warning
+ for: 3m
+ - name: ClickHouse Authentication Failures
+ description: Authentication failures detected, indicating potential security issues or misconfiguration.
+ query: "increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0"
+ severity: info
+ for: 0m
+ - name: ClickHouse Access Denied Errors
+ description: Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.
+ query: "increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0"
+ severity: info
+ for: 0m
+
+
- name: Zookeeper
exporters:
- name: cloudflare/kafka_zookeeper_exporter
slug: cloudflare-kafka-zookeeper-exporter
doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
rules:
- - name: dabealu/zookeeper-exporter
+ - name: dabealu/zookeeper-exporter
slug: dabealu-zookeeper-exporter
doc_url: https://github.com/dabealu/zookeeper-exporter
rules:
- name: Zookeeper Down
description: "Zookeeper down on instance {{ $labels.instance }}"
- query: 'zk_up == 0'
+ query: "zk_up == 0"
severity: critical
- name: Zookeeper missing leader
- description: "Zookeeper cluster has no node marked as leader"
- query: 'sum(zk_server_leader) == 0'
+ description: "Zookeeper cluster has no node marked as leader"
+ query: "sum(zk_server_leader) == 0"
severity: critical
- name: Zookeeper Too Many Leaders
description: "Zookeeper cluster has too many nodes marked as leader"
- query: 'sum(zk_server_leader) > 1'
+ query: "sum(zk_server_leader) > 1"
severity: critical
- name: Zookeeper Not Ok
description: "Zookeeper instance is not ok"
- query: 'zk_ruok == 0'
+ query: "zk_ruok == 0"
severity: warning
for: 3m
@@ -1280,11 +1450,11 @@ groups:
rules:
- name: Kafka topics replicas
description: Kafka topic in-sync partition
- query: 'sum(kafka_topic_partition_in_sync_replica) by (topic) < 3'
+ query: "sum(kafka_topic_partition_in_sync_replica) by (topic) < 3"
severity: critical
- name: Kafka consumers group
description: Kafka consumers group
- query: 'sum(kafka_consumergroup_lag) by (consumergroup) > 50'
+ query: "sum(kafka_consumergroup_lag) by (consumergroup) > 50"
severity: critical
for: 1m
- name: linkedin/Burrow
@@ -1293,11 +1463,11 @@ groups:
rules:
- name: Kafka topic offset decreased
description: Kafka topic offset has decreased
- query: 'delta(kafka_burrow_partition_current_offset[1m]) < 0'
+ query: "delta(kafka_burrow_partition_current_offset[1m]) < 0"
severity: warning
- name: Kafka consumer lag
description: Kafka consumer has a 30 minutes and increasing lag
- query: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0'
+ query: "kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0"
severity: warning
for: 15m
@@ -1366,24 +1536,105 @@ groups:
rules:
- name: Nats high connection count
description: High number of NATS connections ({{ $value }}) for {{ $labels.instance }}
- query: 'gnatsd_varz_connections > 100'
+ query: "gnatsd_varz_connections > 100"
severity: warning
for: 3m
- name: Nats high pending bytes
description: High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}
- query: 'gnatsd_connz_pending_bytes > 100000'
+ query: "gnatsd_connz_pending_bytes > 100000"
severity: warning
for: 3m
- name: Nats high subscriptions count
description: High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}
- query: 'gnatsd_connz_subscriptions > 50'
+ query: "gnatsd_connz_subscriptions > 50"
severity: warning
for: 3m
- name: Nats high routes count
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
- query: 'gnatsd_routez_num_routes > 10'
+ query: "gnatsd_varz_routes > 10"
severity: warning
for: 3m
+ - name: Nats high memory usage
+ description: NATS server memory usage is above 200MB for {{ $labels.instance }}
+ query: "gnatsd_varz_mem > 200 * 1024 * 1024"
+ severity: warning
+ for: 5m
+ - name: Nats slow consumers
+ description: There are slow consumers in NATS for {{ $labels.instance }}
+ query: "gnatsd_varz_slow_consumers > 0"
+ severity: critical
+ for: 3m
+ - name: Nats server down
+ description: NATS server has been down for more than 5 minutes
+ query: 'absent(up{job="nats"})'
+ severity: critical
+ for: 5m
+ - name: Nats high CPU usage
+ description: NATS server is using more than 80% CPU for the last 5 minutes
+ query: "gnatsd_varz_cpu > 80" # gauge expressed in percent; rate() is only meaningful on counters, `for` supplies the 5m window
+ severity: warning
+ for: 5m
+ - name: Nats high number of connections
+ description: NATS server has more than 1000 active connections
+ query: "gnatsd_connz_num_connections > 1000"
+ severity: warning
+ for: 5m
+ - name: Nats high JetStream store usage
+ description: JetStream store usage is over 80%
+ query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
+ severity: warning
+ for: 5m
+ - name: Nats high JetStream memory usage
+ description: JetStream memory usage is over 80%
+ query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
+ severity: warning
+ for: 5m
+ - name: Nats high number of subscriptions
+ description: NATS server has more than 1000 active subscriptions
+ query: "gnatsd_connz_subscriptions > 1000"
+ severity: warning
+ for: 5m
+ - name: Nats high pending bytes
+ description: NATS server has more than 100,000 pending bytes
+ query: "gnatsd_connz_pending_bytes > 100000"
+ severity: warning
+ for: 5m
+ - name: Nats too many errors
+ description: NATS server has encountered errors in the last 5 minutes
+ query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
+ severity: warning
+ for: 5m
+ - name: Nats JetStream consumers exceeded
+ description: JetStream has more than 100 active consumers
+ query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
+ severity: warning
+ for: 5m
+ - name: Nats frequent authentication timeouts
+ description: There have been more than 5 authentication timeouts in the last 5 minutes
+ query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
+ severity: warning
+ for: 5m
+ - name: Nats max payload size exceeded
+ description: The max payload size allowed by NATS has been exceeded (1MB)
+ query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
+ severity: critical
+ for: 5m
+ - name: Nats leaf node connection issue
+ description: NATS server has had no leaf node connections for the last 5 minutes
+ query: "gnatsd_varz_leafnodes == 0" # current leaf-node count (gauge); increase() == 0 was true whenever the count was merely stable
+ severity: critical
+ for: 5m
+ - name: Nats max ping operations exceeded
+ description: The maximum number of ping operations in NATS has exceeded 50
+ query: "gnatsd_varz_ping_max > 50"
+ severity: warning
+ for: 5m
+ - name: Nats write deadline exceeded
+ description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
+ query: "gnatsd_varz_write_deadline > 10"
+ severity: critical
+ for: 5m
+
- name: Solr
exporters:
@@ -1393,7 +1644,7 @@ groups:
rules:
- name: Solr update errors
description: Solr collection {{ $labels.collection }} has failed updates for replica {{ $labels.replica }} on {{ $labels.base_url }}.
- query: 'increase(solr_metrics_core_update_handler_errors_total[1m]) > 1'
+ query: "increase(solr_metrics_core_update_handler_errors_total[1m]) > 1"
severity: critical
- name: Solr query errors
description: Solr has increased query errors in collection {{ $labels.collection }} for replica {{ $labels.replica }} on {{ $labels.base_url }}.
@@ -1406,9 +1657,85 @@ groups:
severity: critical
- name: Solr low live node count
description: Solr collection {{ $labels.collection }} has less than two live nodes for replica {{ $labels.replica }} on {{ $labels.base_url }}.
- query: 'solr_collections_live_nodes < 2'
+ query: "solr_collections_live_nodes < 2"
severity: critical
+ - name: Hadoop
+ exporters:
+ - name: hadoop/jmx_exporter
+ slug: jmx_exporter
+ doc_url: https://github.com/prometheus/jmx_exporter
+ rules:
+ # NameNode scrape target reports up == 0 for 5 minutes
+ - name: Hadoop Name Node Down
+ description: "The Hadoop NameNode service is unavailable."
+ query: 'up{job="hadoop-namenode"} == 0'
+ severity: critical
+ for: 5m
+
+ # ResourceManager scrape target reports up == 0 for 5 minutes
+ - name: Hadoop Resource Manager Down
+ description: "The Hadoop ResourceManager service is unavailable."
+ query: 'up{job="hadoop-resourcemanager"} == 0'
+ severity: critical
+ for: 5m
+
+ # DataNode last-heartbeat metric stays at 0 for 10 minutes
+ - name: Hadoop Data Node Out Of Service
+ description: "The Hadoop DataNode is not sending heartbeats."
+ query: "hadoop_datanode_last_heartbeat == 0"
+ severity: warning
+ for: 10m
+
+ # Free HDFS fraction, (total - used) / total, stays below 10% for 15 minutes
+ - name: Hadoop HDFS Disk Space Low
+ description: "Available HDFS disk space is running low."
+ query: "(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1"
+ severity: warning
+ for: 15m
+
+ # MapReduce failures above 100 (NOTE(review): raw *_total value vs. fixed threshold - consider increase())
+ - name: Hadoop Map Reduce Task Failures
+ description: "There is an unusually high number of MapReduce task failures."
+ query: "hadoop_mapreduce_task_failures_total > 100"
+ severity: critical
+ for: 10m
+
+ # ResourceManager memory / max memory stays above 80% for 15 minutes
+ - name: Hadoop Resource Manager Memory High
+ description: "The Hadoop ResourceManager is approaching its memory limit."
+ query: "hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8"
+ severity: warning
+ for: 15m
+
+ # YARN allocation failures above 10 (NOTE(review): raw *_total value vs. fixed threshold - consider increase())
+ - name: Hadoop YARN Container Allocation Failures
+ description: "There is a significant number of YARN container allocation failures."
+ query: "hadoop_yarn_container_allocation_failures_total > 10"
+ severity: warning
+ for: 10m
+
+ # HBase region count stays above 5000 for 15 minutes
+ - name: Hadoop HBase Region Count High
+ description: "The HBase cluster has an unusually high number of regions."
+ query: "hadoop_hbase_region_count > 5000"
+ severity: warning
+ for: 15m
+
+ # heap_bytes / max_heap_bytes below 0.2 for 10 minutes (NOTE(review): confirm heap_bytes is free, not used, heap)
+ - name: Hadoop HBase Region Server Heap Low
+ description: "HBase Region Servers are running low on heap space."
+ query: "hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2"
+ severity: critical
+ for: 10m
+
+ # HBase write-request latency stays above 0.5 seconds for 10 minutes
+ - name: Hadoop HBase Write Requests Latency High
+ description: "HBase Write Requests are experiencing high latency."
+ query: "hadoop_hbase_write_requests_latency_seconds > 0.5"
+ severity: warning
+ for: 10m
+
- name: Reverse proxies and load balancers
services:
- name: Nginx
@@ -1429,7 +1756,7 @@ groups:
for: 1m
- name: Nginx latency high
description: Nginx p99 latency is higher than 3 seconds
- query: 'histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3'
+ query: "histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[2m])) by (host, node, le)) > 3"
severity: warning
for: 2m
@@ -1441,7 +1768,7 @@ groups:
rules:
- name: Apache down
description: Apache down
- query: 'apache_up == 0'
+ query: "apache_up == 0"
severity: critical
- name: Apache workers load
description: Apache workers in busy state approach the max workers count 80% workers busy on {{ $labels.instance }}
@@ -1450,7 +1777,7 @@ groups:
for: 2m
- name: Apache restart
description: Apache has just been restarted.
- query: 'apache_uptime_seconds_total / 60 < 1'
+ query: "apache_uptime_seconds_total / 60 < 1"
severity: warning
- name: HaProxy
@@ -1557,54 +1884,54 @@ groups:
for: 1m
- name: HAProxy server response errors
description: Too many response errors to {{ $labels.server }} server (> 5%).
- query: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
+ query: "sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5"
severity: critical
for: 1m
- name: HAProxy backend connection errors
description: Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.
- query: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
+ query: "sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100"
severity: critical
for: 1m
- name: HAProxy server connection errors
description: Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.
- query: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
+ query: "sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100"
severity: critical
- name: HAProxy backend max active session
description: HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).
- query: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
+ query: "((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80"
severity: warning
for: 2m
- name: HAProxy pending requests
description: Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend
- query: 'sum by (backend) (haproxy_backend_current_queue) > 0'
+ query: "sum by (backend) (haproxy_backend_current_queue) > 0"
severity: warning
for: 2m
- name: HAProxy HTTP slowing down
description: Average request time is increasing
- query: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
+ query: "avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1"
severity: warning
for: 1m
- name: HAProxy retry high
description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend
- query: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
+ query: "sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10"
severity: warning
for: 2m
- name: HAProxy backend down
description: HAProxy backend is down
- query: 'haproxy_backend_up == 0'
+ query: "haproxy_backend_up == 0"
severity: critical
- name: HAProxy server down
description: HAProxy server is down
- query: 'haproxy_server_up == 0'
+ query: "haproxy_server_up == 0"
severity: critical
- name: HAProxy frontend security blocked requests
description: HAProxy is blocking requests for security reason
- query: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
+ query: "sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10"
severity: warning
for: 2m
- name: HAProxy server healthcheck failure
description: Some server healthcheck are failing on {{ $labels.server }}
- query: 'increase(haproxy_server_check_failures_total[1m]) > 0'
+ query: "increase(haproxy_server_check_failures_total[1m]) > 0"
severity: warning
for: 1m
@@ -1616,7 +1943,7 @@ groups:
rules:
- name: Traefik service down
description: All Traefik services are down
- query: 'count(traefik_service_server_up) by (service) == 0'
+ query: "count(traefik_service_server_up) by (service) == 0"
severity: critical
- name: Traefik high HTTP 4xx error rate service
description: Traefik service 4xx error rate is above 5%
@@ -1634,7 +1961,7 @@ groups:
rules:
- name: Traefik backend down
description: All Traefik backends are down
- query: 'count(traefik_backend_server_up) by (backend) == 0'
+ query: "count(traefik_backend_server_up) by (backend) == 0"
severity: critical
- name: Traefik high HTTP 4xx error rate backend
description: Traefik backend 4xx error rate is above 5%
@@ -1647,6 +1974,28 @@ groups:
severity: critical
for: 1m
+ - name: Caddy
+ exporters:
+ - name: Embedded exporter
+ slug: embedded-exporter # the test workflow names the generated rules file after the exporter's .slug
+ doc_url: https://caddyserver.com/docs/metrics
+ rules:
+ - name: Caddy Reverse Proxy Down
+ description: "All Caddy reverse proxies are down"
+ query: "count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0"
+ severity: critical
+ for: 0m
+ - name: Caddy high HTTP 4xx error rate service
+ description: "Caddy service 4xx error rate is above 5%"
+ query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+ severity: critical
+ for: 1m
+ - name: Caddy high HTTP 5xx error rate service
+ description: "Caddy service 5xx error rate is above 5%"
+ query: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+ severity: critical
+ for: 1m
+
- name: Runtimes
services:
- name: PHP-FPM
@@ -1657,7 +2005,7 @@ groups:
rules:
- name: PHP-FPM max-children reached
description: PHP-FPM reached max children - {{ $labels.instance }}
- query: 'sum(phpfpm_max_children_reached_total) by (instance) > 0'
+ query: "sum(phpfpm_max_children_reached_total) by (instance) > 0"
severity: warning
- name: JVM
@@ -1680,15 +2028,14 @@ groups:
rules:
- name: Sidekiq queue size
description: Sidekiq queue {{ $labels.name }} is growing
- query: 'sidekiq_queue_size > 100'
+ query: "sidekiq_queue_size > 100"
severity: warning
for: 1m
- name: Sidekiq scheduling latency too high
description: Sidekiq jobs are taking more than 1min to be picked up. Users may be seeing delays in background processing.
- query: 'max(sidekiq_queue_latency) > 60'
+ query: "max(sidekiq_queue_latency) > 60"
severity: critical
-
- name: Orchestrators
services:
- name: Kubernetes
@@ -1697,148 +2044,173 @@ groups:
slug: kubestate-exporter
doc_url: https://github.com/kubernetes/kube-state-metrics/tree/master/docs
rules:
- - name: Kubernetes node not ready
+ - name: Kubernetes Node not ready
+ summary: Kubernetes Node not ready (node {{ $labels.node }})
description: Node {{ $labels.node }} has been unready for a long time
query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
severity: critical
for: 10m
- - name: Kubernetes memory pressure
- description: "{{ $labels.node }} has MemoryPressure condition"
+ - name: Kubernetes Node memory pressure
+ summary: Kubernetes memory pressure (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} has MemoryPressure condition"
query: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
severity: critical
for: 2m
- - name: Kubernetes disk pressure
- description: "{{ $labels.node }} has DiskPressure condition"
+ - name: Kubernetes Node disk pressure
+ summary: Kubernetes disk pressure (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} has DiskPressure condition"
query: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
severity: critical
for: 2m
- - name: Kubernetes network unavailable
- description: "{{ $labels.node }} has NetworkUnavailable condition"
+ - name: Kubernetes Node network unavailable
+ description: "Node {{ $labels.node }} has NetworkUnavailable condition"
query: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
severity: critical
for: 2m
- - name: Kubernetes out of capacity
- description: "{{ $labels.node }} is out of capacity"
- query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+ - name: Kubernetes Node out of pod capacity
+ description: "Node {{ $labels.node }} is out of pod capacity"
+ query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
severity: warning
for: 2m
- - name: Kubernetes container oom killer
+ - name: Kubernetes Container oom killer
+ summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes."
query: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
severity: warning
- name: Kubernetes Job failed
+ summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete"
- query: 'kube_job_status_failed > 0'
+ query: "kube_job_status_failed > 0"
+ severity: warning
+ - name: Kubernetes Job not starting
+ summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+ description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes"
+ query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600"
severity: warning
- name: Kubernetes CronJob suspended
+ summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended"
- query: 'kube_cronjob_spec_suspend != 0'
+ query: "kube_cronjob_spec_suspend != 0"
severity: warning
- name: Kubernetes PersistentVolumeClaim pending
+ summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending"
query: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1'
severity: warning
for: 2m
- name: Kubernetes Volume out of disk space
description: Volume is almost full (< 10% left)
- query: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10'
+ query: "kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10"
severity: warning
for: 2m
- name: Kubernetes Volume full in four days
- description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
- query: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+ description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available."
+ query: "predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0"
severity: critical
- name: Kubernetes PersistentVolume error
- description: "Persistent volume is in bad state"
+ summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
+ description: "Persistent volume {{ $labels.persistentvolume }} is in bad state"
query: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
severity: critical
- name: Kubernetes StatefulSet down
- description: A StatefulSet went down
- query: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
+ summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down
+ query: "kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0"
severity: critical
for: 1m
- - name: Kubernetes HPA scaling ability
- description: Pod is unable to scale
- query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+ - name: Kubernetes HPA scale inability
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale
+ query: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
severity: warning
for: 2m
- - name: Kubernetes HPA metric availability
- description: HPA is not able to collect metrics
+ - name: Kubernetes HPA metrics unavailability
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics
query: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
severity: warning
- - name: Kubernetes HPA scale capability
- description: The maximum number of desired Pods has been hit
- query: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+ - name: Kubernetes HPA scale maximum
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods
+ query: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
severity: info
for: 2m
- name: Kubernetes HPA underutilized
- description: HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.
- query: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' # allow minimum 3 replicas running
+ description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.
+ query: "max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (namespace, horizontalpodautoscaler) > 3" # allow minimum 3 replicas running; namespace kept so the description label resolves
severity: info
- name: Kubernetes Pod not healthy
- description: Pod has been in a non-ready state for longer than 15 minutes.
+ summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
+ description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.
query: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
severity: critical
for: 15m
- name: Kubernetes pod crash looping
- description: Pod {{ $labels.pod }} is crash looping
- query: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
+ summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
+ description: Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping
+ query: "increase(kube_pod_container_status_restarts_total[1m]) > 3"
severity: warning
for: 2m
- - name: Kubernetes ReplicasSet mismatch
- description: Deployment Replicas mismatch
- query: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
+ - name: Kubernetes ReplicaSet replicas mismatch
+ summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
+ description: ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch
+ query: "kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas"
severity: warning
for: 10m
- name: Kubernetes Deployment replicas mismatch
- description: Deployment Replicas mismatch
- query: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
+ summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+ description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch
+ query: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available"
severity: warning
for: 10m
- name: Kubernetes StatefulSet replicas mismatch
- description: A StatefulSet does not match the expected number of replicas.
- query: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
+ description: StatefulSet does not match the expected number of replicas.
+ query: "kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas"
severity: warning
for: 10m
- name: Kubernetes Deployment generation mismatch
- description: A Deployment has failed but has not been rolled back.
- query: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
+ summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+ description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.
+ query: "kube_deployment_status_observed_generation != kube_deployment_metadata_generation"
severity: critical
for: 10m
- name: Kubernetes StatefulSet generation mismatch
- description: A StatefulSet has failed but has not been rolled back.
- query: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
+ summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.
+ query: "kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation"
severity: critical
for: 10m
- name: Kubernetes StatefulSet update not rolled out
- description: StatefulSet update has not been rolled out.
- query: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
+ summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
+ query: "max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)"
severity: warning
for: 10m
- name: Kubernetes DaemonSet rollout stuck
- description: Some Pods of DaemonSet are not scheduled or not ready
- query: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
+ summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
+ description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready
+ query: "kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0"
severity: warning
for: 10m
- name: Kubernetes DaemonSet misscheduled
- description: Some DaemonSet Pods are running where they are not supposed to run
- query: 'kube_daemonset_status_number_misscheduled > 0'
+ summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
+ description: Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run
+ query: "kube_daemonset_status_number_misscheduled > 0"
severity: critical
for: 1m
- name: Kubernetes CronJob too long
+ summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
- query: 'time() - kube_cronjob_next_schedule_time > 3600'
+ query: "time() - kube_cronjob_next_schedule_time > 3600"
severity: warning
comments: |
Threshold should be customized for each cronjob name.
- - name: Kubernetes job slow completion
+ - name: Kubernetes Job slow completion
+ summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
description: Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.
- query: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
+ query: "kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0"
severity: critical
for: 12h
- name: Kubernetes API server errors
description: Kubernetes API server is experiencing high error rate
- query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+ query: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
severity: critical
for: 2m
- name: Kubernetes API client errors
@@ -1855,12 +2227,11 @@ groups:
query: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60'
severity: critical
- name: Kubernetes API server latency
- description: 'Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.'
- query: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+ description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}."
+ query: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
severity: warning
for: 2m
-
- name: Nomad
exporters:
- name: Embedded exporter
@@ -1868,20 +2239,20 @@ groups:
rules:
- name: Nomad job failed
description: Nomad job failed
- query: 'nomad_nomad_job_summary_failed > 0'
+ query: "nomad_nomad_job_summary_failed > 0"
severity: warning
- name: Nomad job lost
description: Nomad job lost
- query: 'nomad_nomad_job_summary_lost > 0'
+ query: "nomad_nomad_job_summary_lost > 0"
severity: warning
- name: Nomad job queued
description: Nomad job queued
- query: 'nomad_nomad_job_summary_queued > 0'
+ query: "nomad_nomad_job_summary_queued > 0"
severity: warning
for: 2m
- name: Nomad blocked evaluation
description: Nomad blocked evaluation
- query: 'nomad_nomad_blocked_evals_total_blocked > 0'
+ query: "nomad_nomad_blocked_evals_total_blocked > 0"
severity: warning
- name: Consul
@@ -1892,12 +2263,12 @@ groups:
rules:
- name: Consul service healthcheck failed
description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`"
- query: 'consul_catalog_service_node_healthy == 0'
+ query: "consul_catalog_service_node_healthy == 0"
severity: critical
- for: 1m # allows a short service restart
+ for: 1m # allows a short service restart
- name: Consul missing master node
description: Numbers of consul raft peers should be 3, in order to preserve quorum.
- query: 'consul_raft_peers < 3'
+ query: "consul_raft_peers < 3"
severity: critical
- name: Consul agent unhealthy
description: A Consul agent is down
@@ -1911,15 +2282,15 @@ groups:
rules:
- name: Etcd insufficient Members
description: Etcd cluster should have an odd number of members
- query: 'count(etcd_server_id) % 2 == 0'
+ query: "count(etcd_server_id) % 2 == 0"
severity: critical
- name: Etcd no Leader
description: Etcd cluster have no leader
- query: 'etcd_server_has_leader == 0'
+ query: "etcd_server_has_leader == 0"
severity: critical
- name: Etcd high number of leader changes
description: Etcd leader changed more than 2 times during 10 minutes
- query: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2'
+ query: "increase(etcd_server_leader_changes_seen_total[10m]) > 2"
severity: warning
- name: Etcd high number of failed GRPC requests
description: More than 1% GRPC request failure detected in Etcd
@@ -1938,37 +2309,37 @@ groups:
for: 2m
- name: Etcd high number of failed HTTP requests
description: More than 1% HTTP failure detected in Etcd
- query: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01'
+ query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01"
severity: warning
for: 2m
- name: Etcd high number of failed HTTP requests
description: More than 5% HTTP failure detected in Etcd
- query: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05'
+ query: "sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05"
severity: critical
for: 2m
- name: Etcd HTTP requests slow
description: HTTP requests slowing down, 99th percentile is over 0.15s
- query: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15'
+ query: "histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15"
severity: warning
for: 2m
- name: Etcd member communication slow
description: Etcd member communication slowing down, 99th percentile is over 0.15s
- query: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15'
+ query: "histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15"
severity: warning
for: 2m
- name: Etcd high number of failed proposals
description: Etcd server got more than 5 failed proposals past hour
- query: 'increase(etcd_server_proposals_failed_total[1h]) > 5'
+ query: "increase(etcd_server_proposals_failed_total[1h]) > 5"
severity: warning
for: 2m
- name: Etcd high fsync durations
description: Etcd WAL fsync duration increasing, 99th percentile is over 0.5s
- query: 'histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5'
+ query: "histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5"
severity: warning
for: 2m
- name: Etcd high commit durations
description: Etcd commit duration increasing, 99th percentile is over 0.25s
- query: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25'
+ query: "histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25"
severity: warning
for: 2m
@@ -1980,7 +2351,7 @@ groups:
rules:
- name: Linkerd high error rate
description: Linkerd error rate for {{ $labels.deployment | $labels.statefulset | $labels.daemonset }} is over 10%
- query: 'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10'
+ query: "sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10"
severity: warning
for: 1m
@@ -1997,7 +2368,7 @@ groups:
for: 1m
- name: Istio Pilot high total request rate
description: Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.
- query: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5'
+ query: "sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5"
severity: warning
for: 1m
- name: Istio Mixer Prometheus dispatches low
@@ -2032,12 +2403,12 @@ groups:
for: 1m
- name: Istio latency 99 percentile
description: Istio 1% slowest requests are longer than 1000ms.
- query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
+ query: "histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000"
severity: warning
for: 1m
- name: Istio Pilot Duplicate Entry
description: Istio pilot duplicate entry error.
- query: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
+ query: "sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0"
severity: critical
- name: ArgoCD
@@ -2057,7 +2428,6 @@ groups:
severity: warning
for: 15m
-
- name: Network, security and storage
services:
- name: Ceph
@@ -2068,25 +2438,25 @@ groups:
rules:
- name: Ceph State
description: Ceph instance unhealthy
- query: 'ceph_health_status != 0'
+ query: "ceph_health_status != 0"
severity: critical
- name: Ceph monitor clock skew
description: Ceph monitor clock skew detected. Please check ntp and hardware clock settings
- query: 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
+ query: "abs(ceph_monitor_clock_skew_seconds) > 0.2"
severity: warning
for: 2m
- name: Ceph monitor low space
description: Ceph monitor storage is low.
- query: 'ceph_monitor_avail_percent < 10'
+ query: "ceph_monitor_avail_percent < 10"
severity: warning
for: 2m
- name: Ceph OSD Down
description: Ceph Object Storage Daemon Down
- query: 'ceph_osd_up == 0'
+ query: "ceph_osd_up == 0"
severity: critical
- name: Ceph high OSD latency
description: "Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state."
- query: 'ceph_osd_perf_apply_latency_seconds > 5'
+ query: "ceph_osd_perf_apply_latency_seconds > 5"
severity: warning
for: 1m
- name: Ceph OSD low space
@@ -2096,16 +2466,16 @@ groups:
for: 2m
- name: Ceph OSD reweighted
description: Ceph Object Storage Daemon takes too much time to resize.
- query: 'ceph_osd_weight < 1'
+ query: "ceph_osd_weight < 1"
severity: warning
for: 2m
- name: Ceph PG down
description: Some Ceph placement groups are down. Please ensure that all the data are available.
- query: 'ceph_pg_down > 0'
+ query: "ceph_pg_down > 0"
severity: critical
- name: Ceph PG incomplete
description: Some Ceph placement groups are incomplete. Please ensure that all the data are available.
- query: 'ceph_pg_incomplete > 0'
+ query: "ceph_pg_incomplete > 0"
severity: critical
- name: Ceph PG inconsistent
description: Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
@@ -2113,17 +2483,17 @@ groups:
severity: warning
- name: Ceph PG activation long
description: Some Ceph placement groups are too long to activate.
- query: 'ceph_pg_activating > 0'
+ query: "ceph_pg_activating > 0"
severity: warning
for: 2m
- name: Ceph PG backfill full
description: Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
- query: 'ceph_pg_backfill_toofull > 0'
+ query: "ceph_pg_backfill_toofull > 0"
severity: warning
for: 2m
- name: Ceph PG unavailable
description: Some Ceph placement groups are unavailable.
- query: 'ceph_pg_total - ceph_pg_active > 0'
+ query: "ceph_pg_total - ceph_pg_active > 0"
severity: critical
- name: SpeedTest
@@ -2134,11 +2504,11 @@ groups:
rules:
- name: SpeedTest Slow Internet Download
description: Internet download speed is currently {{humanize $value}} Mbps.
- query: 'avg_over_time(speedtest_download[10m]) < 100'
+ query: "avg_over_time(speedtest_download[10m]) < 100"
severity: warning
- name: SpeedTest Slow Internet Upload
description: Internet upload speed is currently {{humanize $value}} Mbps.
- query: 'avg_over_time(speedtest_upload[10m]) < 20'
+ query: "avg_over_time(speedtest_upload[10m]) < 20"
severity: warning
- name: ZFS
@@ -2148,7 +2518,7 @@ groups:
doc_url: https://github.com/prometheus/node_exporter
rules:
- name: ZFS offline pool
- description: 'A ZFS zpool is in a unexpected state: {{ $labels.state }}.'
+ description: "A ZFS zpool is in an unexpected state: {{ $labels.state }}."
query: 'node_zfs_zpool_state{state!="online"} > 0'
severity: critical
for: 1m
@@ -2158,11 +2528,11 @@ groups:
rules:
- name: ZFS pool out of space
description: Disk is almost full (< 10% left)
- query: 'zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0'
+ query: "zfs_pool_free_bytes * 100 / zfs_pool_size_bytes < 10 and ON (instance, device, mountpoint) zfs_pool_readonly == 0"
severity: warning
- name: ZFS pool unhealthy
description: ZFS pool state is {{ $value }}. See comments for more information.
- query: 'zfs_pool_health > 0'
+ query: "zfs_pool_health > 0"
severity: critical
comments: |
0: ONLINE
@@ -2174,7 +2544,7 @@ groups:
6: SUSPENDED
- name: ZFS collector failed
description: ZFS collector for {{ $labels.instance }} has failed to collect information
- query: 'zfs_scrape_collector_success != 1'
+ query: "zfs_scrape_collector_success != 1"
severity: warning
- name: OpenEBS
@@ -2183,8 +2553,8 @@ groups:
slug: embedded-exporter
rules:
- name: OpenEBS used pool capacity
- description: 'OpenEBS Pool use more than 80% of his capacity'
- query: 'openebs_used_pool_capacity_percent > 80'
+ description: "OpenEBS Pool uses more than 80% of its capacity"
+ query: "openebs_used_pool_capacity_percent > 80"
severity: warning
for: 2m
@@ -2194,15 +2564,15 @@ groups:
slug: embedded-exporter
rules:
- name: Minio cluster disk offline
- description: 'Minio cluster disk is offline'
- query: 'minio_cluster_disk_offline_total > 0'
+ description: "Minio cluster disk is offline"
+ query: "minio_cluster_drive_offline_total > 0"
severity: critical
- name: Minio node disk offline
- description: 'Minio cluster node disk is offline'
- query: 'minio_cluster_nodes_offline_total > 0'
+ description: "Minio cluster node disk is offline"
+ query: "minio_cluster_nodes_offline_total > 0"
severity: critical
- name: Minio disk space usage
- description: 'Minio available free space is low (< 10%)'
+ description: "Minio available free space is low (< 10%)"
query: disk_storage_available / disk_storage_total * 100 < 10
severity: warning
@@ -2225,7 +2595,7 @@ groups:
query: ssl_ocsp_response_status == 1
severity: critical
- name: SSL certificate expiry (< 7 days)
- description: '{{ $labels.instance }} Certificate is expiring in 7 days'
+ description: "{{ $labels.instance }} Certificate is expiring in 7 days"
query: ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7
severity: warning
@@ -2241,12 +2611,12 @@ groups:
severity: critical
- name: Juniper high Bandwidth Usage 1GiB
description: Interface is highly saturated. (> 0.90GiB/s)
- query: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90'
+ query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.90"
severity: critical
for: 1m
- name: Juniper high Bandwidth Usage 1GiB
description: Interface is getting saturated. (> 0.80GiB/s)
- query: 'rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80'
+ query: "rate(junos_interface_transmit_bytes[1m]) * 8 > 1e+9 * 0.80"
severity: warning
for: 1m
@@ -2257,7 +2627,7 @@ groups:
rules:
- name: CoreDNS Panic Count
description: Number of CoreDNS panics encountered
- query: 'increase(coredns_panics_total[1m]) > 0'
+ query: "increase(coredns_panics_total[1m]) > 0"
severity: critical
- name: Freeswitch
@@ -2268,16 +2638,16 @@ groups:
rules:
- name: Freeswitch down
description: Freeswitch is unresponsive
- query: 'freeswitch_up == 0'
+ query: "freeswitch_up == 0"
severity: critical
- name: Freeswitch Sessions Warning
description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 80'
+ query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 80"
severity: warning
for: 10m
- name: Freeswitch Sessions Critical
description: 'High sessions usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: '(freeswitch_session_active * 100 / freeswitch_session_limit) > 90'
+ query: "(freeswitch_session_active * 100 / freeswitch_session_limit) > 90"
severity: critical
for: 5m
@@ -2288,12 +2658,12 @@ groups:
doc_url: https://github.com/hashicorp/vault/blob/master/website/content/docs/configuration/telemetry.mdx#prometheus
rules:
- name: Vault sealed
- description: 'Vault instance is sealed on {{ $labels.instance }}'
- query: 'vault_core_unsealed == 0'
+ description: "Vault instance is sealed on {{ $labels.instance }}"
+ query: "vault_core_unsealed == 0"
severity: critical
- name: Vault too many pending tokens
description: 'Too many pending tokens {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: 'avg(vault_token_create_count - vault_token_store_count) > 0'
+ query: "avg(vault_token_create_count - vault_token_store_count) > 0"
severity: warning
for: 5m
- name: Vault too many infinity tokens
@@ -2303,10 +2673,9 @@ groups:
for: 5m
- name: Vault cluster health
description: 'Vault cluster is not healthy {{ $labels.instance }}: {{ $value | printf "%.2f"}}%'
- query: 'sum(vault_core_active) / count(vault_core_active) <= 0.5'
+ query: "sum(vault_core_active) / count(vault_core_active) <= 0.5"
severity: critical
-
- name: Cloudflare
exporters:
- name: lablabs/cloudflare-exporter
@@ -2314,15 +2683,14 @@ groups:
doc_url: https://github.com/lablabs/cloudflare-exporter
rules:
- name: Cloudflare http 4xx error rate
- description: 'Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})'
+ description: "Cloudflare high HTTP 4xx error rate (> 5% for domain {{ $labels.zone }})"
query: '(sum by(zone) (rate(cloudflare_zone_requests_status{status=~"^4.."}[15m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[15m]))) * 100 > 5'
severity: warning
- name: Cloudflare http 5xx error rate
- description: 'Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})'
+ description: "Cloudflare high HTTP 5xx error rate (> 5% for domain {{ $labels.zone }})"
query: '(sum by (zone) (rate(cloudflare_zone_requests_status{status=~"^5.."}[5m])) / on (zone) sum by (zone) (rate(cloudflare_zone_requests_status[5m]))) * 100 > 5'
severity: critical
-
- name: Other
services:
- name: Thanos
@@ -2330,252 +2698,252 @@ groups:
- name: Thanos Compactor
slug: thanos-compactor
rules:
- - name: Thanos Compactor Multiple Running
- description: 'No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running.'
- query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
- severity: warning
- for: 5m
- - name: Thanos Compactor Halted
- description: 'Thanos Compact {{$labels.job}} has failed to run and now is halted.'
- query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
- severity: warning
- for: 5m
- - name: Thanos Compactor High Compaction Failures
- description: 'Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.'
- query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 15m
- - name: Thanos Compact Bucket High Operation Failures
- description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
- query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 15m
- - name: Thanos Compact Has Not Run
- description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.'
- query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
- severity: warning
- for: 0m
+ - name: Thanos Compactor Multiple Running
+ description: "No more than one Thanos Compact instance should be running at once. There are {{$value}} instances running."
+ query: 'sum by (job) (up{job=~".*thanos-compact.*"}) > 1'
+ severity: warning
+ for: 5m
+ - name: Thanos Compactor Halted
+ description: "Thanos Compact {{$labels.job}} has failed to run and now is halted."
+ query: 'thanos_compact_halted{job=~".*thanos-compact.*"} == 1'
+ severity: warning
+ for: 5m
+ - name: Thanos Compactor High Compaction Failures
+ description: "Thanos Compact {{$labels.job}} is failing to execute {{$value | humanize}}% of compactions."
+ query: '(sum by (job) (rate(thanos_compact_group_compactions_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_compact_group_compactions_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 15m
+ - name: Thanos Compact Bucket High Operation Failures
+ description: "Thanos Compact {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
+ query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-compact.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-compact.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 15m
+ - name: Thanos Compact Has Not Run
+ description: "Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours."
+ query: '(time() - max by (job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{job=~".*thanos-compact.*"}[24h]))) / 60 / 60 > 24'
+ severity: warning
+ for: 0m
- name: Thanos Query
slug: thanos-query
rules:
- - name: Thanos Query Http Request Query Error Rate High
- description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
- query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
- severity: critical
- for: 5m
- - name: Thanos Query Http Request Query Range Error Rate High
- description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
- query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
- severity: critical
- for: 5m
- - name: Thanos Query Grpc Server Error Rate
- description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
- query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 5m
- - name: Thanos Query Grpc Client Error Rate
- description: 'Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests.'
- query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
- severity: warning
- for: 5m
- - name: Thanos Query High D N S Failures
- description: 'Thanos Query {{$labels.job}} have {{$value | humanize}}% of failing DNS queries for store endpoints.'
- query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
- severity: warning
- for: 15m
- - name: Thanos Query Instant Latency High
- description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries.'
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
- severity: critical
- for: 10m
- - name: Thanos Query Range Latency High
- description: 'Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries.'
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
- severity: critical
- for: 10m
- - name: Thanos Query Overload
- description: 'Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultanous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connnected Prometheus instances, look for potential senders of these requests and then contact support.'
- query: '(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)'
- severity: warning
- for: 15m
+ - name: Thanos Query Http Request Query Error Rate High
+ description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query" requests.'
+ query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query"}[5m]))) * 100 > 5'
+ severity: critical
+ for: 5m
+ - name: Thanos Query Http Request Query Range Error Rate High
+ description: 'Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of "query_range" requests.'
+ query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-query.*", handler="query_range"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-query.*", handler="query_range"}[5m]))) * 100 > 5'
+ severity: critical
+ for: 5m
+ - name: Thanos Query Grpc Server Error Rate
+ description: "Thanos Query {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+ query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-query.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-query.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 5m
+ - name: Thanos Query Grpc Client Error Rate
+ description: "Thanos Query {{$labels.job}} is failing to send {{$value | humanize}}% of requests."
+ query: '(sum by (job) (rate(grpc_client_handled_total{grpc_code!="OK", job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(grpc_client_started_total{job=~".*thanos-query.*"}[5m]))) * 100 > 5'
+ severity: warning
+ for: 5m
+ - name: Thanos Query High D N S Failures
+ description: "Thanos Query {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for store endpoints."
+ query: '(sum by (job) (rate(thanos_query_store_apis_dns_failures_total{job=~".*thanos-query.*"}[5m])) / sum by (job) (rate(thanos_query_store_apis_dns_lookups_total{job=~".*thanos-query.*"}[5m]))) * 100 > 1'
+ severity: warning
+ for: 15m
+ - name: Thanos Query Instant Latency High
+ description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for instant queries."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m]))) > 40 and sum by (job) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query"}[5m])) > 0)'
+ severity: critical
+ for: 10m
+ - name: Thanos Query Range Latency High
+ description: "Thanos Query {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for range queries."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-query.*", handler="query_range"}[5m]))) > 90 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-query.*", handler="query_range"}[5m])) > 0)'
+ severity: critical
+ for: 10m
+ - name: Thanos Query Overload
+ description: "Thanos Query {{$labels.job}} has been overloaded for more than 15 minutes. This may be a symptom of excessive simultaneous complex requests, low performance of the Prometheus API, or failures within these components. Assess the health of the Thanos query instances, the connected Prometheus instances, look for potential senders of these requests and then contact support."
+ query: "(max_over_time(thanos_query_concurrent_gate_queries_max[5m]) - avg_over_time(thanos_query_concurrent_gate_queries_in_flight[5m]) < 1)"
+ severity: warning
+ for: 15m
- name: Thanos Receiver
slug: thanos-receiver
rules:
- - name: Thanos Receive Http Request Error Rate High
- description: 'Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
- query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
- severity: critical
- for: 5m
- - name: Thanos Receive Http Request Latency High
- description: 'Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests.'
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
- severity: critical
- for: 10m
- - name: Thanos Receive High Replication Failures
- description: 'Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests.'
- query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
- severity: warning
- for: 5m
- - name: Thanos Receive High Forward Request Failures
- description: 'Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests.'
- query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
- severity: info
- for: 5m
- - name: Thanos Receive High Hashring File Refresh Failures
- description: 'Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed.'
- query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
- severity: warning
- for: 15m
- - name: Thanos Receive Config Reload Failure
- description: 'Thanos Receive {{$labels.job}} has not been able to reload hashring configurations.'
- query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
- severity: warning
- for: 5m
- - name: Thanos Receive No Upload
- description: 'Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage.'
- query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
- severity: critical
- for: 3h
+ - name: Thanos Receive Http Request Error Rate High
+ description: "Thanos Receive {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+ query: '(sum by (job) (rate(http_requests_total{code=~"5..", job=~".*thanos-receive.*", handler="receive"}[5m]))/ sum by (job) (rate(http_requests_total{job=~".*thanos-receive.*", handler="receive"}[5m]))) * 100 > 5'
+ severity: critical
+ for: 5m
+ - name: Thanos Receive Http Request Latency High
+ description: "Thanos Receive {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for requests."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(http_request_duration_seconds_bucket{job=~".*thanos-receive.*", handler="receive"}[5m]))) > 10 and sum by (job) (rate(http_request_duration_seconds_count{job=~".*thanos-receive.*", handler="receive"}[5m])) > 0)'
+ severity: critical
+ for: 10m
+ - name: Thanos Receive High Replication Failures
+ description: "Thanos Receive {{$labels.job}} is failing to replicate {{$value | humanize}}% of requests."
+ query: 'thanos_receive_replication_factor > 1 and ((sum by (job) (rate(thanos_receive_replications_total{result="error", job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_replications_total{job=~".*thanos-receive.*"}[5m]))) > (max by (job) (floor((thanos_receive_replication_factor{job=~".*thanos-receive.*"}+1)/ 2)) / max by (job) (thanos_receive_hashring_nodes{job=~".*thanos-receive.*"}))) * 100'
+ severity: warning
+ for: 5m
+ - name: Thanos Receive High Forward Request Failures
+ description: "Thanos Receive {{$labels.job}} is failing to forward {{$value | humanize}}% of requests."
+ query: '(sum by (job) (rate(thanos_receive_forward_requests_total{result="error", job=~".*thanos-receive.*"}[5m]))/ sum by (job) (rate(thanos_receive_forward_requests_total{job=~".*thanos-receive.*"}[5m]))) * 100 > 20'
+ severity: info
+ for: 5m
+ - name: Thanos Receive High Hashring File Refresh Failures
+ description: "Thanos Receive {{$labels.job}} is failing to refresh hashring file, {{$value | humanize}} of attempts failed."
+ query: '(sum by (job) (rate(thanos_receive_hashrings_file_errors_total{job=~".*thanos-receive.*"}[5m])) / sum by (job) (rate(thanos_receive_hashrings_file_refreshes_total{job=~".*thanos-receive.*"}[5m])) > 0)'
+ severity: warning
+ for: 15m
+ - name: Thanos Receive Config Reload Failure
+ description: "Thanos Receive {{$labels.job}} has not been able to reload hashring configurations."
+ query: 'avg by (job) (thanos_receive_config_last_reload_successful{job=~".*thanos-receive.*"}) != 1'
+ severity: warning
+ for: 5m
+ - name: Thanos Receive No Upload
+ description: "Thanos Receive {{$labels.instance}} has not uploaded latest data to object storage."
+ query: '(up{job=~".*thanos-receive.*"} - 1) + on (job, instance) (sum by (job, instance) (increase(thanos_shipper_uploads_total{job=~".*thanos-receive.*"}[3h])) == 0)'
+ severity: critical
+ for: 3h
- name: Thanos Sidecar
slug: thanos-sidecar
rules:
- - name: Thanos Sidecar Bucket Operations Failed
- description: 'Thanos Sidecar {{$labels.instance}} bucket operations are failing'
- query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
- severity: critical
- for: 5m
- - name: Thanos Sidecar No Connection To Started Prometheus
- description: 'Thanos Sidecar {{$labels.instance}} is unhealthy.'
- query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
- severity: critical
- for: 5m
+ - name: Thanos Sidecar Bucket Operations Failed
+ description: "Thanos Sidecar {{$labels.instance}} bucket operations are failing"
+ query: 'sum by (job, instance) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-sidecar.*"}[5m])) > 0'
+ severity: critical
+ for: 5m
+ - name: Thanos Sidecar No Connection To Started Prometheus
+ description: "Thanos Sidecar {{$labels.instance}} is unhealthy."
+ query: 'thanos_sidecar_prometheus_up{job=~".*thanos-sidecar.*"} == 0 and on (namespace, pod)prometheus_tsdb_data_replay_duration_seconds != 0'
+ severity: critical
+ for: 5m
- name: Thanos Store
slug: thanos-store
rules:
- - name: Thanos Store Grpc Error Rate
- description: 'Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
- query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 5m
- - name: Thanos Store Series Gate Latency High
- description: 'Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests.'
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
- severity: warning
- for: 10m
- - name: Thanos Store Bucket High Operation Failures
- description: 'Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.'
- query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 15m
- - name: Thanos Store Objstore Operation Latency High
- description: 'Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations.'
- query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
- severity: warning
- for: 10m
+ - name: Thanos Store Grpc Error Rate
+ description: "Thanos Store {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+ query: '(sum by (job) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-store.*"}[5m]))/ sum by (job) (rate(grpc_server_started_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 5m
+ - name: Thanos Store Series Gate Latency High
+ description: "Thanos Store {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for store series gate requests."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_bucket_store_series_gate_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_bucket_store_series_gate_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+ severity: warning
+ for: 10m
+ - name: Thanos Store Bucket High Operation Failures
+ description: "Thanos Store {{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations."
+ query: '(sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{job=~".*thanos-store.*"}[5m])) / sum by (job) (rate(thanos_objstore_bucket_operations_total{job=~".*thanos-store.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 15m
+ - name: Thanos Store Objstore Operation Latency High
+ description: "Thanos Store {{$labels.job}} Bucket has a 99th percentile latency of {{$value}} seconds for the bucket operations."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_objstore_bucket_operation_duration_seconds_bucket{job=~".*thanos-store.*"}[5m]))) > 2 and sum by (job) (rate(thanos_objstore_bucket_operation_duration_seconds_count{job=~".*thanos-store.*"}[5m])) > 0)'
+ severity: warning
+ for: 10m
- name: Thanos Ruler
slug: thanos-ruler
rules:
- - name: Thanos Rule Queue Is Dropping Alerts
- description: 'Thanos Rule {{$labels.instance}} is failing to queue alerts.'
- query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
- severity: critical
- for: 5m
- - name: Thanos Rule Sender Is Failing Alerts
- description: 'Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager.'
- query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
- severity: critical
- for: 5m
- - name: Thanos Rule High Rule Evaluation Failures
- description: 'Thanos Rule {{$labels.instance}} is failing to evaluate rules.'
- query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
- severity: critical
- for: 5m
- - name: Thanos Rule High Rule Evaluation Warnings
- description: 'Thanos Rule {{$labels.instance}} has high number of evaluation warnings.'
- query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
- severity: info
- for: 15m
- - name: Thanos Rule Rule Evaluation Latency High
- description: 'Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}.'
- query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
- severity: warning
- for: 5m
- - name: Thanos Rule Grpc Error Rate
- description: 'Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests.'
- query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
- severity: warning
- for: 5m
- - name: Thanos Rule Config Reload Failure
- description: 'Thanos Rule {{$labels.job}} has not been able to reload its configuration.'
- query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
- severity: info
- for: 5m
- - name: Thanos Rule Query High D N S Failures
- description: 'Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints.'
- query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
- severity: warning
- for: 15m
- - name: Thanos Rule Alertmanager High D N S Failures
- description: 'Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints.'
- query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
- severity: warning
- for: 15m
- - name: Thanos Rule No Evaluation For10 Intervals
- description: 'Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval.'
- query: 'time() - max by (job, instance, group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})'
- severity: info
- for: 5m
- - name: Thanos No Rule Evaluations
- description: 'Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes.'
- query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
- severity: critical
- for: 5m
+ - name: Thanos Rule Queue Is Dropping Alerts
+ description: "Thanos Rule {{$labels.instance}} is failing to queue alerts."
+ query: 'sum by (job, instance) (rate(thanos_alert_queue_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+ severity: critical
+ for: 5m
+ - name: Thanos Rule Sender Is Failing Alerts
+ description: "Thanos Rule {{$labels.instance}} is failing to send alerts to alertmanager."
+ query: 'sum by (job, instance) (rate(thanos_alert_sender_alerts_dropped_total{job=~".*thanos-rule.*"}[5m])) > 0'
+ severity: critical
+ for: 5m
+ - name: Thanos Rule High Rule Evaluation Failures
+ description: "Thanos Rule {{$labels.instance}} is failing to evaluate rules."
+ query: '(sum by (job, instance) (rate(prometheus_rule_evaluation_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
+ severity: critical
+ for: 5m
+ - name: Thanos Rule High Rule Evaluation Warnings
+ description: "Thanos Rule {{$labels.instance}} has high number of evaluation warnings."
+ query: 'sum by (job, instance) (rate(thanos_rule_evaluation_with_warnings_total{job=~".*thanos-rule.*"}[5m])) > 0'
+ severity: info
+ for: 15m
+ - name: Thanos Rule Rule Evaluation Latency High
+ description: "Thanos Rule {{$labels.instance}} has higher evaluation latency than interval for {{$labels.rule_group}}."
+ query: '(sum by (job, instance, rule_group) (prometheus_rule_group_last_duration_seconds{job=~".*thanos-rule.*"}) > sum by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"}))'
+ severity: warning
+ for: 5m
+ - name: Thanos Rule Grpc Error Rate
+ description: "Thanos Rule {{$labels.job}} is failing to handle {{$value | humanize}}% of requests."
+ query: '(sum by (job, instance) (rate(grpc_server_handled_total{grpc_code=~"Unknown|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded", job=~".*thanos-rule.*"}[5m]))/ sum by (job, instance) (rate(grpc_server_started_total{job=~".*thanos-rule.*"}[5m])) * 100 > 5)'
+ severity: warning
+ for: 5m
+ - name: Thanos Rule Config Reload Failure
+ description: "Thanos Rule {{$labels.job}} has not been able to reload its configuration."
+ query: 'avg by (job, instance) (thanos_rule_config_last_reload_successful{job=~".*thanos-rule.*"}) != 1'
+ severity: info
+ for: 5m
+ - name: Thanos Rule Query High D N S Failures
+ description: "Thanos Rule {{$labels.job}} has {{$value | humanize}}% of failing DNS queries for query endpoints."
+ query: '(sum by (job, instance) (rate(thanos_rule_query_apis_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_query_apis_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
+ severity: warning
+ for: 15m
+ - name: Thanos Rule Alertmanager High D N S Failures
+ description: "Thanos Rule {{$labels.instance}} has {{$value | humanize}}% of failing DNS queries for Alertmanager endpoints."
+ query: '(sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_failures_total{job=~".*thanos-rule.*"}[5m])) / sum by (job, instance) (rate(thanos_rule_alertmanagers_dns_lookups_total{job=~".*thanos-rule.*"}[5m])) * 100 > 1)'
+ severity: warning
+ for: 15m
+ - name: Thanos Rule No Evaluation For10 Intervals
+ description: "Thanos Rule {{$labels.job}} has rule groups that did not evaluate for at least 10x of their expected interval."
+ query: 'time() - max by (job, instance, rule_group) (prometheus_rule_group_last_evaluation_timestamp_seconds{job=~".*thanos-rule.*"})>10 * max by (job, instance, rule_group) (prometheus_rule_group_interval_seconds{job=~".*thanos-rule.*"})' # grouped by rule_group, as in "Thanos Rule Rule Evaluation Latency High", so each group is checked separately
+ severity: info
+ for: 5m
+ - name: Thanos No Rule Evaluations
+ description: "Thanos Rule {{$labels.instance}} did not perform any rule evaluations in the past 10 minutes."
+ query: 'sum by (job, instance) (rate(prometheus_rule_evaluations_total{job=~".*thanos-rule.*"}[5m])) <= 0 and sum by (job, instance) (thanos_rule_loaded_rules{job=~".*thanos-rule.*"}) > 0'
+ severity: critical
+ for: 5m
- name: Thanos Bucket Replicate
slug: thanos-bucket-replicate
rules:
- - name: Thanos Bucket Replicate Error Rate
- description: 'Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed.'
- query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
- severity: critical
- for: 5m
- - name: Thanos Bucket Replicate Run Latency
- description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations.'
- query: '(histogram_quantile(0.99, sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)'
- severity: critical
- for: 5m
+ - name: Thanos Bucket Replicate Error Rate
+ description: "Thanos Replicate is failing to run, {{$value | humanize}}% of attempts failed."
+ query: '(sum by (job) (rate(thanos_replicate_replication_runs_total{result="error", job=~".*thanos-bucket-replicate.*"}[5m]))/ on (job) group_left sum by (job) (rate(thanos_replicate_replication_runs_total{job=~".*thanos-bucket-replicate.*"}[5m]))) * 100 >= 10'
+ severity: critical
+ for: 5m
+ - name: Thanos Bucket Replicate Run Latency
+ description: "Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{$value}} seconds for the replicate operations."
+ query: '(histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{job=~".*thanos-bucket-replicate.*"}[5m]))) > 20 and sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_count{job=~".*thanos-bucket-replicate.*"}[5m])) > 0)' # le must survive the aggregation for histogram_quantile; _count is the traffic guard, as in the Thanos Store latency rules
+ severity: critical
+ for: 5m
- name: Thanos Component Absent
slug: thanos-component-absent
rules:
- - name: Thanos Compact Is Down
- description: 'ThanosCompact has disappeared. Prometheus target for the component cannot be discovered.'
- query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
- severity: critical
- for: 5m
- - name: Thanos Query Is Down
- description: 'ThanosQuery has disappeared. Prometheus target for the component cannot be discovered.'
- query: 'absent(up{job=~".*thanos-query.*"} == 1)'
- severity: critical
- for: 5m
- - name: Thanos Receive Is Down
- description: 'ThanosReceive has disappeared. Prometheus target for the component cannot be discovered.'
- query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
- severity: critical
- for: 5m
- - name: Thanos Rule Is Down
- description: 'ThanosRule has disappeared. Prometheus target for the component cannot be discovered.'
- query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
- severity: critical
- for: 5m
- - name: Thanos Sidecar Is Down
- description: 'ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered.'
- query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
- severity: critical
- for: 5m
- - name: Thanos Store Is Down
- description: 'ThanosStore has disappeared. Prometheus target for the component cannot be discovered.'
- query: absent(up{job=~".*thanos-store.*"} == 1)
- severity: critical
- for: 5m
+ - name: Thanos Compact Is Down
+ description: "ThanosCompact has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-compact.*"} == 1)'
+ severity: critical
+ for: 5m
+ - name: Thanos Query Is Down
+ description: "ThanosQuery has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-query.*"} == 1)'
+ severity: critical
+ for: 5m
+ - name: Thanos Receive Is Down
+ description: "ThanosReceive has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-receive.*"} == 1)'
+ severity: critical
+ for: 5m
+ - name: Thanos Rule Is Down
+ description: "ThanosRule has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-rule.*"} == 1)'
+ severity: critical
+ for: 5m
+ - name: Thanos Sidecar Is Down
+ description: "ThanosSidecar has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-sidecar.*"} == 1)'
+ severity: critical
+ for: 5m
+ - name: Thanos Store Is Down
+ description: "ThanosStore has disappeared. Prometheus target for the component cannot be discovered."
+ query: 'absent(up{job=~".*thanos-store.*"} == 1)'
+ severity: critical
+ for: 5m
- name: Loki
exporters:
@@ -2647,6 +3015,15 @@ groups:
severity: critical
for: 5m
+ - name: Grafana Alloy
+ exporters:
+ - name: Embedded exporter
+ slug: embedded-exporter
+ rules:
+ - name: Grafana Alloy service down
+ description: Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.
+ query: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)' # instances seen 2m ago that report nothing now
+ severity: critical
- name: Jenkins
exporters:
- name: Metric plugin
@@ -2655,32 +3032,32 @@ groups:
rules:
- name: Jenkins offline
description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
- query: 'jenkins_node_offline_value > 1'
+ query: "jenkins_node_offline_value > 1"
severity: critical
- name: Jenkins healthcheck
description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
- query: 'jenkins_health_check_score < 1'
+ query: "jenkins_health_check_score < 1"
severity: critical
- name: Jenkins outdated plugins
description: "{{ $value }} plugins need update"
- query: 'sum(jenkins_plugins_withUpdate) by (instance) > 3'
+ query: "sum(jenkins_plugins_withUpdate) by (instance) > 3"
severity: warning
for: 1d
- name: Jenkins builds health score
description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
- query: 'default_jenkins_builds_health_score < 1'
+ query: "default_jenkins_builds_health_score < 1"
severity: critical
- name: Jenkins run failure total
description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})"
- query: 'delta(jenkins_runs_failure_total[1h]) > 100'
+ query: "delta(jenkins_runs_failure_total[1h]) > 100"
severity: warning
- name: Jenkins build tests failing
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
- query: 'default_jenkins_builds_last_build_tests_failing > 0'
+ query: "default_jenkins_builds_last_build_tests_failing > 0"
severity: warning
- name: Jenkins last build failed
description: "Last build failed: {{$labels.jenkins_job}}. Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})"
- query: 'default_jenkins_builds_last_build_result_ordinal == 2'
+ query: "default_jenkins_builds_last_build_result_ordinal == 2"
severity: warning
comments: |
* RUNNING -1 true - The build had no errors.
@@ -2698,26 +3075,56 @@ groups:
rules:
- name: APC UPS Battery nearly empty
description: Battery is almost empty (< 10% left)
- query: 'apcupsd_battery_charge_percent < 10'
+ query: "apcupsd_battery_charge_percent < 10"
severity: critical
- name: APC UPS Less than 15 Minutes of battery time remaining
description: Battery is almost empty (< 15 Minutes remaining)
- query: 'apcupsd_battery_time_left_seconds < 900'
+ query: "apcupsd_battery_time_left_seconds < 900"
severity: critical
- name: APC UPS AC input outage
description: UPS now running on battery (since {{$value | humanizeDuration}})
- query: 'apcupsd_battery_time_on_seconds > 0'
+ query: "apcupsd_battery_time_on_seconds > 0"
severity: warning
- name: APC UPS low battery voltage
description: Battery voltage is lower than nominal (< 95%)
- query: '(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95'
+ query: "(apcupsd_battery_volts / apcupsd_battery_nominal_volts) < 0.95"
severity: warning
- name: APC UPS high temperature
description: Internal temperature is high ({{$value}}°C)
- query: 'apcupsd_internal_temperature_celsius >= 40'
+ query: "apcupsd_internal_temperature_celsius >= 40"
severity: warning
for: 2m
- name: APC UPS high load
description: UPS load is > 80%
- query: 'apcupsd_ups_load_percent > 80'
+ query: "apcupsd_ups_load_percent > 80"
severity: warning
+
+ - name: Graph Node
+ exporters:
+ - name: Embedded exporter
+ slug: embedded-exporter
+ rules:
+ - name: Provider failed because net_version failed
+ description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+ query: "eth_rpc_status == 1"
+ severity: critical
+ - name: Provider failed because get genesis failed
+ description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+ query: "eth_rpc_status == 2"
+ severity: critical
+ - name: Provider failed because net_version timeout
+ description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+ query: "eth_rpc_status == 3"
+ severity: critical
+ - name: Provider failed because get genesis timeout
+ description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`"
+ query: "eth_rpc_status == 4"
+ severity: critical
+ - name: Store connection is too slow
+ description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
+ query: "store_connection_wait_time_ms > 10"
+ severity: warning
+ - name: Store connection is too slow
+ description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`"
+ query: "store_connection_wait_time_ms > 20"
+ severity: critical
diff --git a/_layouts/default.html b/_layouts/default.html
index 8e73c74..56f4a92 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -125,6 +125,18 @@
class="fa fa-linkedin" target="_blank">
+
+
+
@@ -147,7 +159,7 @@
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
b=c.createElement('script');b.type='text/javascript';
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
- }(window,document,'$screeb','https://t.screeb.app/tag.js'));
+ }(window,document,'$screeb','https://t2.screeb.app/tag.js'));
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
diff --git a/alertmanager.md b/alertmanager.md
index 7de03e9..d350945 100644
--- a/alertmanager.md
+++ b/alertmanager.md
@@ -80,7 +80,7 @@ route:
- receiver: "pager"
group_wait: 10s
match_re:
- severity: critial
+ severity: critical
continue: true
receivers:
@@ -135,4 +135,7 @@ If the notification takes too much time to be triggered, check the following del
- `for: 5m` (alerts/example-mysql.yml)
- `group_wait = 10s` (alertmanager.yml)
-Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
+Also read:
+- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html)
+- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
+- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)
diff --git a/assets/css/app.css b/assets/css/app.css
index 81f18ee..e42ff66 100644
--- a/assets/css/app.css
+++ b/assets/css/app.css
@@ -115,3 +115,29 @@ h2 {
max-width: 85rem;
}
}
+
+ul#sponsoring {
+ display: flex;
+ align-items: center;
+ justify-content: center;
+ margin-top: 50px;
+}
+
+ul#sponsoring li {
+ display: flex;
+ padding: 0px 15px;
+ font-size: 16px;
+}
+
+ul#sponsoring li a {
+ display: flex;
+}
+
+ul#sponsoring li a img {
+ max-width: 180px;
+ max-height: 80px;
+}
+
+.page-header {
+ padding-bottom: 30px;
+}
\ No newline at end of file
diff --git a/assets/sponsor-betterstack.png b/assets/sponsor-betterstack.png
new file mode 100644
index 0000000..b0c12b0
Binary files /dev/null and b/assets/sponsor-betterstack.png differ
diff --git a/dist/rules/caddy/null.yml b/dist/rules/caddy/null.yml
new file mode 100644
index 0000000..64b0230
--- /dev/null
+++ b/dist/rules/caddy/null.yml
@@ -0,0 +1,32 @@
+groups:
+
+- name:
+
+ rules:
+
+ - alert: CaddyReverseProxyDown
+ expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
+ description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: CaddyHighHttp4xxErrorRateService
+ expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
+ description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: CaddyHighHttp5xxErrorRateService
+ expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
+ for: 1m
+ labels:
+ severity: critical
+ annotations:
+ summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
+ description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/clickhouse/embedded-exporter.yml b/dist/rules/clickhouse/embedded-exporter.yml
new file mode 100644
index 0000000..3efe551
--- /dev/null
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@@ -0,0 +1,131 @@
+groups:
+
+- name: EmbeddedExporter
+
+ rules:
+
+ - alert: ClickhouseMemoryUsageCritical
+ expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
+ description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseMemoryUsageWarning
+ expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
+ description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseDiskSpaceLowOnDefault
+ expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
+ description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseDiskSpaceCriticalOnDefault
+ expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
+ description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseDiskSpaceLowOnBackups
+ expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
+ description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseReplicaErrors
+ expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
+ description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseNoAvailableReplicas
+ expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
+ description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseNoLiveReplicas
+ expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
+ description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseHighNetworkTraffic
+ expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
+ description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseHighTcpConnections
+ expr: 'ClickHouseMetrics_TCPConnection > 400'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
+ description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseInterserverConnectionIssues
+ expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
+ description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseZookeeperConnectionIssues
+ expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
+ for: 3m
+ labels:
+ severity: warning
+ annotations:
+ summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
+ description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseAuthenticationFailures
+ expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
+ for: 0m
+ labels:
+ severity: info
+ annotations:
+ summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
+ description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ClickhouseAccessDeniedErrors
+ expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
+ for: 0m
+ labels:
+ severity: info
+ annotations:
+ summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
+ description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/docker-containers/google-cadvisor.yml b/dist/rules/docker-containers/google-cadvisor.yml
index d0c343b..02ca9c3 100644
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@@ -23,7 +23,7 @@ groups:
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighCpuUtilization
- expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
+ expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
for: 2m
labels:
severity: warning
@@ -50,8 +50,8 @@ groups:
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate
- expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
- for: 2m
+ expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
+ for: 5m
labels:
severity: warning
annotations:
@@ -69,7 +69,7 @@ groups:
- alert: ContainerLowCpuUtilization
- expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
+ expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
for: 7d
labels:
severity: info
diff --git a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
index 4ed5660..5e6bb9d 100644
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@@ -138,3 +138,39 @@ groups:
annotations:
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ElasticsearchHighIndexingLatency  # Warns when indexing time divided by indexing operations exceeds 0.0005.
+ expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'  # NOTE(review): no rate() is applied; if both series are cumulative counters this is an all-time average, not recent latency - confirm
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
+ description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ElasticsearchHighIndexingRate  # Warns when the summed per-second indexing rate exceeds 10000.
+ expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'  # sum() has no by-clause, so the result is one label-less series; NOTE(review): the instance label used in the summary will render empty
+ for: 5m  # expression must stay true for 5m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
+ description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ElasticsearchHighQueryRate  # Warns when the summed per-second search query rate exceeds 100.
+ expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'  # sum() has no by-clause, so the result is one label-less series; NOTE(review): the instance label used in the summary will render empty
+ for: 5m  # expression must stay true for 5m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
+ description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ElasticsearchHighQueryLatency  # Warns when search fetch time divided by fetch count exceeds 1.
+ expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'  # NOTE(review): uses the search *fetch* series although the alert is named query latency, and applies no rate() - confirm both are intended
+ for: 5m  # expression must stay true for 5m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
+ description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml
new file mode 100644
index 0000000..d86c8a4
--- /dev/null
+++ b/dist/rules/grafana-alloy/embedded-exporter.yml
@@ -0,0 +1,14 @@
+groups:
+
+- name: EmbeddedExporter
+
+ rules:
+
+ - alert: GrafanaAlloyServiceDown  # Critical alert when an Alloy instance that reported build info 2m ago no longer reports it.
+ expr: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)'  # left side = instances seen 2m ago; "unless" removes those still present now, leaving the ones that disappeared
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Grafana Alloy service down (instance {{ $labels.instance }})
+ description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/graph-node/embedded-exporter.yml b/dist/rules/graph-node/embedded-exporter.yml
new file mode 100644
index 0000000..a8d0768
--- /dev/null
+++ b/dist/rules/graph-node/embedded-exporter.yml
@@ -0,0 +1,59 @@
+groups:
+
+- name: EmbeddedExporter
+
+ rules:
+
+ - alert: ProviderFailedBecauseNet_versionFailed  # Critical alert for eth_rpc_status value 1, reported here as a net_version failure.
+ expr: 'eth_rpc_status == 1'  # matches series whose current value is exactly 1
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Provider failed because net_version failed (instance {{ $labels.instance }})
+ description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ProviderFailedBecauseGetGenesisFailed  # Critical alert for eth_rpc_status value 2, reported here as a get-genesis failure.
+ expr: 'eth_rpc_status == 2'  # matches series whose current value is exactly 2
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Provider failed because get genesis failed (instance {{ $labels.instance }})
+ description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ProviderFailedBecauseNet_versionTimeout  # Critical alert for eth_rpc_status value 3, reported here as a net_version timeout.
+ expr: 'eth_rpc_status == 3'  # matches series whose current value is exactly 3
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Provider failed because net_version timeout (instance {{ $labels.instance }})
+ description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: ProviderFailedBecauseGetGenesisTimeout  # Critical alert for eth_rpc_status value 4, reported here as a get-genesis timeout.
+ expr: 'eth_rpc_status == 4'  # matches series whose current value is exactly 4
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
+ description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: StoreConnectionIsTooSlow  # Warning tier: store connection wait time above 10 (the metric name suggests milliseconds).
+ expr: 'store_connection_wait_time_ms > 10'  # has no upper bound, so it also matches values above 20, where the critical rule of the same name in this file matches too
+ for: 0m  # no hold period
+ labels:
+ severity: warning
+ annotations:
+ summary: Store connection is too slow (instance {{ $labels.instance }})
+ description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: StoreConnectionIsTooSlow  # Critical tier: store connection wait time above 20 (the metric name suggests milliseconds); shares its alert name with the warning rule in this file.
+ expr: 'store_connection_wait_time_ms > 20'  # stricter threshold than the warning tier (> 10)
+ for: 0m  # no hold period
+ labels:
+ severity: critical
+ annotations:
+ summary: Store connection is too slow (instance {{ $labels.instance }})
+ description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml
new file mode 100644
index 0000000..42d6ee3
--- /dev/null
+++ b/dist/rules/hadoop/jmx_exporter.yml
@@ -0,0 +1,95 @@
+groups:
+
+- name: Jmx_exporter
+
+ rules:
+
+ - alert: HadoopNameNodeDown  # Critical alert when the NameNode scrape target reports up == 0.
+ expr: 'up{job="hadoop-namenode"} == 0'  # only matches targets whose job label is exactly "hadoop-namenode"
+ for: 5m  # expression must stay true for 5m before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ summary: Hadoop Name Node Down (instance {{ $labels.instance }})
+ description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopResourceManagerDown  # Critical alert when the ResourceManager scrape target reports up == 0.
+ expr: 'up{job="hadoop-resourcemanager"} == 0'  # only matches targets whose job label is exactly "hadoop-resourcemanager"
+ for: 5m  # expression must stay true for 5m before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
+ description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopDataNodeOutOfService  # Warns when the DataNode last-heartbeat metric is 0.
+ expr: 'hadoop_datanode_last_heartbeat == 0'  # matches series whose current value is exactly 0; NOTE(review): confirm that 0 is how this metric signals a missing heartbeat
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
+ description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopHdfsDiskSpaceLow  # Warns when the unused share of HDFS bytes drops below 10%.
+ expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'  # (total - used) / total, i.e. the free fraction
+ for: 15m  # expression must stay true for 15m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
+ description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopMapReduceTaskFailures  # Critical alert when the MapReduce task-failure metric exceeds 100.
+ expr: 'hadoop_mapreduce_task_failures_total > 100'  # compares the raw value; NOTE(review): the _total suffix suggests a cumulative counter, in which case this never clears once crossed - confirm
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
+ description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopResourceManagerMemoryHigh  # Warns when ResourceManager memory exceeds 80% of its maximum.
+ expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'  # ratio of memory_bytes to memory_max_bytes
+ for: 15m  # expression must stay true for 15m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
+ description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopYarnContainerAllocationFailures  # Warns when the YARN container allocation-failure metric exceeds 10.
+ expr: 'hadoop_yarn_container_allocation_failures_total > 10'  # compares the raw value; NOTE(review): the _total suffix suggests a cumulative counter, in which case this never clears once crossed - confirm
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
+ description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopHbaseRegionCountHigh  # Warns when the HBase region count exceeds 5000.
+ expr: 'hadoop_hbase_region_count > 5000'  # fixed threshold on the raw metric value
+ for: 15m  # expression must stay true for 15m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
+ description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopHbaseRegionServerHeapLow  # Critical alert when region-server heap divided by max heap is below 0.2.
+ expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'  # NOTE(review): if heap_bytes is *used* heap, a ratio below 0.2 means low usage, the opposite of "running low on heap space" - confirm metric meaning
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: critical
+ annotations:
+ summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
+ description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HadoopHbaseWriteRequestsLatencyHigh  # Warns when the HBase write-request latency metric exceeds 0.5.
+ expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'  # fixed threshold on the raw metric value (the name suggests seconds)
+ for: 10m  # expression must stay true for 10m before the alert fires
+ labels:
+ severity: warning
+ annotations:
+ summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
+ description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/haproxy/haproxy-exporter-v1.yml b/dist/rules/haproxy/haproxy-exporter-v1.yml
index 2b2f93f..7be81a0 100644
--- a/dist/rules/haproxy/haproxy-exporter-v1.yml
+++ b/dist/rules/haproxy/haproxy-exporter-v1.yml
@@ -77,7 +77,7 @@ groups:
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession
- expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
+ expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
for: 2m
labels:
severity: warning
diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 791d893..7b553eb 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -5,7 +5,7 @@ groups:
rules:
- alert: HostOutOfMemory
- expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
for: 2m
labels:
severity: warning
@@ -14,106 +14,97 @@ groups:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
- expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 2m
+ expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
+ for: 0m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
- description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
- expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 1w
+ expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
+ for: 0m
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
- description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
- expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 5m
+ expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+ for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }})
- description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
- expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 5m
+ expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
+ for: 0m
labels:
severity: warning
annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }})
- description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate
- expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 5m
+ expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
+ for: 0m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
- description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
- - alert: HostUnusualDiskWriteRate
- expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 2m
- labels:
- severity: warning
- annotations:
- summary: Host unusual disk write rate (instance {{ $labels.instance }})
- description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
- expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
- severity: warning
+ severity: critical
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostDiskWillFillIn24Hours
- expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - alert: HostDiskMayFillIn24Hours
+ expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
labels:
severity: warning
annotations:
- summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
- description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
+ description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
- expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
labels:
- severity: warning
+ severity: critical
annotations:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError
- expr: 'node_filesystem_device_error == 1'
- for: 0m
+ expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
+ for: 2m
labels:
severity: critical
annotations:
summary: Host filesystem device error (instance {{ $labels.instance }})
- description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostInodesWillFillIn24Hours
- expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - alert: HostInodesMayFillIn24Hours
+ expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m
labels:
severity: warning
annotations:
- summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
- description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
+ description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
- expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'  # seconds per completed read > 0.1, only where reads actually completed; closing parenthesis restored so the expression parses
for: 2m
labels:
severity: warning
@@ -122,7 +113,7 @@ groups:
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
- expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
for: 2m
labels:
severity: warning
@@ -131,7 +122,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad
- expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'  # busy fraction = 1 - mean idle fraction across CPUs; averaging every non-idle mode series instead divides by the number of modes and can never reach 0.8
for: 10m
labels:
severity: warning
@@ -140,16 +131,16 @@ groups:
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
- expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
- description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
- expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m
labels:
severity: warning
@@ -158,34 +149,34 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait
- expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
- description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo
- expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }})
- description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostContextSwitching
- expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - alert: HostContextSwitchingHigh
+ expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m
labels:
severity: warning
annotations:
- summary: Host context switching (instance {{ $labels.instance }})
- description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host context switching high (instance {{ $labels.instance }})
+ description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
- expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
for: 2m
labels:
severity: warning
@@ -194,7 +185,7 @@ groups:
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
- expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m
labels:
severity: warning
@@ -203,7 +194,7 @@ groups:
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot
- expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m
labels:
severity: warning
@@ -212,7 +203,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
- expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
for: 0m
labels:
severity: critical
@@ -220,35 +211,35 @@ groups:
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostRaidArrayGotInactive
- expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - alert: HostSoftwareRaidInsufficientDrives
+ expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
for: 0m
labels:
severity: critical
annotations:
- summary: Host RAID array got inactive (instance {{ $labels.instance }})
- description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
+ description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostRaidDiskFailure
- expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ - alert: HostSoftwareRaidDiskFailure
+ expr: '(node_md_disks{state="failed"} > 0)'
for: 2m
labels:
severity: warning
annotations:
- summary: Host RAID disk failure (instance {{ $labels.instance }})
- description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host software RAID disk failure (instance {{ $labels.instance }})
+ description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
- expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 6h
+ expr: 'changes(node_uname_info[1h]) > 0'
+ for: 0m
labels:
- severity: warning
+ severity: info
annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }})
- description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
- expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
for: 0m
labels:
severity: warning
@@ -257,7 +248,7 @@ groups:
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
- expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
for: 0m
labels:
severity: info
@@ -266,7 +257,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
- expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_edac_uncorrectable_errors_total > 0)'
for: 0m
labels:
severity: warning
@@ -275,7 +266,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
- expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
@@ -284,7 +275,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors
- expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
for: 2m
labels:
severity: warning
@@ -292,17 +283,8 @@ groups:
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: HostNetworkInterfaceSaturated
- expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: Host Network Interface Saturated (instance {{ $labels.instance }})
- description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
- alert: HostNetworkBondDegraded
- expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '((node_bonding_active - node_bonding_slaves) != 0)'
for: 2m
labels:
severity: warning
@@ -311,7 +293,7 @@ groups:
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit
- expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
for: 5m
labels:
severity: warning
@@ -320,7 +302,7 @@ groups:
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
- expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
for: 10m
labels:
severity: warning
@@ -329,7 +311,7 @@ groups:
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising
- expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
for: 2m
labels:
severity: warning
@@ -338,7 +320,7 @@ groups:
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot
- expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ expr: '(node_reboot_required > 0)'
for: 4h
labels:
severity: info
diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml
index 783682e..7e32694 100644
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@@ -10,44 +10,44 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes node not ready (instance {{ $labels.instance }})
+ summary: Kubernetes Node not ready (node {{ $labels.node }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesMemoryPressure
+ - alert: KubernetesNodeMemoryPressure
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
- summary: Kubernetes memory pressure (instance {{ $labels.instance }})
- description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes memory pressure (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesDiskPressure
+ - alert: KubernetesNodeDiskPressure
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
- summary: Kubernetes disk pressure (instance {{ $labels.instance }})
- description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes disk pressure (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesNetworkUnavailable
+ - alert: KubernetesNodeNetworkUnavailable
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
for: 2m
labels:
severity: critical
annotations:
- summary: Kubernetes network unavailable (instance {{ $labels.instance }})
- description: "{{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes Node network unavailable (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesOutOfCapacity
- expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+ - alert: KubernetesNodeOutOfPodCapacity
+ expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
for: 2m
labels:
severity: warning
annotations:
- summary: Kubernetes out of capacity (instance {{ $labels.instance }})
- description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes Node out of pod capacity (node {{ $labels.node }})
+ description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesContainerOomKiller
expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
@@ -55,7 +55,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes container oom killer (instance {{ $labels.instance }})
+ summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed
@@ -64,16 +64,25 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes Job failed (instance {{ $labels.instance }})
+ summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - alert: KubernetesJobNotStarting
+ expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+ description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: KubernetesCronjobSuspended
expr: 'kube_cronjob_spec_suspend != 0'
for: 0m
labels:
severity: warning
annotations:
- summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
+ summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending
@@ -82,7 +91,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
+ summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace
@@ -95,13 +104,13 @@ groups:
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays
- expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+ expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
for: 0m
labels:
severity: critical
annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
- description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
@@ -109,8 +118,8 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
- description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
+ description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown
expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
@@ -118,35 +127,35 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
- description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesHpaScalingAbility
- expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+ - alert: KubernetesHpaScaleInability
+ expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
for: 2m
labels:
severity: warning
annotations:
- summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
- description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
+ description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesHpaMetricAvailability
+ - alert: KubernetesHpaMetricsUnavailability
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
for: 0m
labels:
severity: warning
annotations:
- summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
- description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
+ description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesHpaScaleCapability
- expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+ - alert: KubernetesHpaScaleMaximum
+ expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
for: 2m
labels:
severity: info
annotations:
- summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
- description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
+ description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaUnderutilized
expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
@@ -155,7 +164,7 @@ groups:
severity: info
annotations:
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
- description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
@@ -163,8 +172,8 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
- description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
+ description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping
expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
@@ -172,17 +181,17 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
- description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
+ description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - alert: KubernetesReplicassetMismatch
+ - alert: KubernetesReplicasetReplicasMismatch
expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
for: 10m
labels:
severity: warning
annotations:
- summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
- description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
+ description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch
expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
@@ -190,8 +199,8 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
- description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+ description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch
expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
@@ -200,7 +209,7 @@ groups:
severity: warning
annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
- description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch
expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@@ -208,8 +217,8 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
- description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
+ description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch
expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
@@ -217,8 +226,8 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
- description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut
expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
@@ -226,8 +235,8 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
- description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
+ description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
@@ -235,8 +244,8 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
- description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
+ description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled
expr: 'kube_daemonset_status_number_misscheduled > 0'
@@ -244,8 +253,8 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
- description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
+ description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
@@ -253,7 +262,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
+ summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion
@@ -262,11 +271,11 @@ groups:
labels:
severity: critical
annotations:
- summary: Kubernetes job slow completion (instance {{ $labels.instance }})
+ summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors
- expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+ expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
for: 2m
labels:
severity: critical
@@ -302,7 +311,7 @@ groups:
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency
- expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+ expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
for: 2m
labels:
severity: warning
diff --git a/dist/rules/meilisearch/embedded-exporter.yml b/dist/rules/meilisearch/embedded-exporter.yml
new file mode 100644
index 0000000..8da2803
--- /dev/null
+++ b/dist/rules/meilisearch/embedded-exporter.yml
@@ -0,0 +1,23 @@
+groups:
+
+- name: EmbeddedExporter
+
+ rules:
+
+ - alert: MeilisearchIndexIsEmpty
+ expr: 'meilisearch_index_docs_count == 0'
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Meilisearch index is empty (instance {{ $labels.instance }})
+ description: "Meilisearch index is empty\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MeilisearchHttpResponseTime
+ expr: 'meilisearch_http_response_time_seconds > 0.5'
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Meilisearch http response time (instance {{ $labels.instance }})
+ description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/minio/embedded-exporter.yml b/dist/rules/minio/embedded-exporter.yml
index 8e19729..1ac2de5 100644
--- a/dist/rules/minio/embedded-exporter.yml
+++ b/dist/rules/minio/embedded-exporter.yml
@@ -5,7 +5,7 @@ groups:
rules:
- alert: MinioClusterDiskOffline
- expr: 'minio_cluster_disk_offline_total > 0'
+ expr: 'minio_cluster_drive_offline_total > 0'
for: 0m
labels:
severity: critical
diff --git a/dist/rules/mongodb/percona-mongodb-exporter.yml b/dist/rules/mongodb/percona-mongodb-exporter.yml
index 3e1e5e9..1bd446f 100644
--- a/dist/rules/mongodb/percona-mongodb-exporter.yml
+++ b/dist/rules/mongodb/percona-mongodb-exporter.yml
@@ -66,12 +66,3 @@ groups:
annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-
- - alert: MongodbVirtualMemoryUsage
- expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
- for: 2m
- labels:
- severity: warning
- annotations:
- summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
- description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/mysql/mysqld-exporter.yml b/dist/rules/mysql/mysqld-exporter.yml
index ad8ed5f..3ef716f 100644
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@@ -22,6 +22,15 @@ groups:
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - alert: MysqlHighPreparedStatementsUtilization(>80%)
+ expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
+ description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: MysqlHighThreadsRunning
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
for: 2m
@@ -84,3 +93,39 @@ groups:
annotations:
summary: MySQL restarted (instance {{ $labels.instance }})
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MysqlHighQps
+ expr: 'irate(mysql_global_status_questions[1m]) > 10000'
+ for: 2m
+ labels:
+ severity: info
+ annotations:
+ summary: MySQL High QPS (instance {{ $labels.instance }})
+ description: "MySQL is being overloaded with unusually high QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MysqlTooManyOpenFiles
+ expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: MySQL too many open files (instance {{ $labels.instance }})
+ description: "MySQL has too many open files; consider increasing the open_files_limit variable on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MysqlInnodbForceRecoveryIsEnabled
+ expr: 'mysql_global_variables_innodb_force_recovery != 0'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
+ description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: MysqlInnodbHistory_lenTooLong
+ expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
+ description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml
index 13eda2b..7648762 100644
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@@ -32,10 +32,154 @@ groups:
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighRoutesCount
- expr: 'gnatsd_routez_num_routes > 10'
+ expr: 'gnatsd_varz_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighMemoryUsage
+ expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high memory usage (instance {{ $labels.instance }})
+ description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsSlowConsumers
+ expr: 'gnatsd_varz_slow_consumers > 0'
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+ summary: Nats slow consumers (instance {{ $labels.instance }})
+ description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsServerDown
+ expr: 'absent(up{job="nats"})'
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: Nats server down (instance {{ $labels.instance }})
+ description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighCpuUsage
+ expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high CPU usage (instance {{ $labels.instance }})
+ description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighNumberOfConnections
+ expr: 'gnatsd_connz_num_connections > 1000'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high number of connections (instance {{ $labels.instance }})
+ description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighJetstreamStoreUsage
+ expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high JetStream store usage (instance {{ $labels.instance }})
+ description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighJetstreamMemoryUsage
+ expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
+ description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighNumberOfSubscriptions
+ expr: 'gnatsd_connz_subscriptions > 1000'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high number of subscriptions (instance {{ $labels.instance }})
+ description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsHighPendingBytes
+ expr: 'gnatsd_connz_pending_bytes > 100000'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats high pending bytes (instance {{ $labels.instance }})
+ description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsTooManyErrors
+ expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats too many errors (instance {{ $labels.instance }})
+ description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsJetstreamConsumersExceeded
+ expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
+ description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsFrequentAuthenticationTimeouts
+ expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
+ description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsMaxPayloadSizeExceeded
+ expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: Nats max payload size exceeded (instance {{ $labels.instance }})
+ description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsLeafNodeConnectionIssue
+ expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: Nats leaf node connection issue (instance {{ $labels.instance }})
+ description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsMaxPingOperationsExceeded
+ expr: 'gnatsd_varz_ping_max > 50'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
+ description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: NatsWriteDeadlineExceeded
+ expr: 'gnatsd_varz_write_deadline > 10'
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: Nats write deadline exceeded (instance {{ $labels.instance }})
+ description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml
index 7d21766..8c57745 100644
--- a/dist/rules/netdata/embedded-exporter.yml
+++ b/dist/rules/netdata/embedded-exporter.yml
@@ -23,7 +23,7 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataHighMemoryUsage
- expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
+ expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
for: 5m
labels:
severity: warning
diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml
index 2c4a793..42e5bb8 100644
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@@ -32,7 +32,7 @@ groups:
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoVacuumed
- expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
+ expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
for: 0m
labels:
severity: warning
@@ -41,7 +41,7 @@ groups:
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoAnalyzed
- expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
+ expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
for: 0m
labels:
severity: warning
@@ -50,7 +50,7 @@ groups:
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections
- expr: ''
+ expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
for: 2m
labels:
severity: warning
@@ -62,7 +62,7 @@ groups:
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for: 2m
labels:
- severity: warning
+ severity: critical
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -86,7 +86,7 @@ groups:
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow
- expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
+ expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 2m
labels:
severity: critical
@@ -140,7 +140,7 @@ groups:
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlConfigurationChanged
- expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+ expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
for: 0m
labels:
severity: info
@@ -155,7 +155,7 @@ groups:
severity: critical
annotations:
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
- description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
@@ -183,3 +183,12 @@ groups:
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: PostgresqlInvalidIndex
+ expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
+ for: 6h
+ labels:
+ severity: warning
+ annotations:
+ summary: Postgresql invalid index (instance {{ $labels.instance }})
+ description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
index 65bfd82..908f001 100644
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@@ -32,7 +32,7 @@ groups:
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissingWithWarmupTime
- expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
+ expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
for: 0m
labels:
severity: critical
diff --git a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
index 05af1c7..40b6d95 100644
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@@ -10,7 +10,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq down (instance {{ $labels.instance }})
+ summary: RabbitMQ down (instance {{ $labels.instance }})
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqClusterDown
@@ -19,7 +19,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq cluster down (instance {{ $labels.instance }})
+ summary: RabbitMQ cluster down (instance {{ $labels.instance }})
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqClusterPartition
@@ -28,7 +28,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq cluster partition (instance {{ $labels.instance }})
+ summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqOutOfMemory
@@ -37,7 +37,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq out of memory (instance {{ $labels.instance }})
+ summary: RabbitMQ out of memory (instance {{ $labels.instance }})
description: "Memory available for RabbmitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
@@ -46,7 +46,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+ summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqDeadLetterQueueFillingUp
@@ -55,7 +55,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})
+ summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyMessagesInQueue
@@ -64,7 +64,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq too many messages in queue (instance {{ $labels.instance }})
+ summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqSlowQueueConsuming
@@ -73,7 +73,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq slow queue consuming (instance {{ $labels.instance }})
+ summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNoConsumer
@@ -82,7 +82,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq no consumer (instance {{ $labels.instance }})
+ summary: RabbitMQ no consumer (instance {{ $labels.instance }})
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConsumers
@@ -91,7 +91,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq too many consumers (instance {{ $labels.instance }})
+ summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqUnactiveExchange
@@ -100,5 +100,5 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq unactive exchange (instance {{ $labels.instance }})
+ summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml
index 6efae97..10823d2 100644
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@@ -10,7 +10,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq node down (instance {{ $labels.instance }})
+ summary: RabbitMQ node down (instance {{ $labels.instance }})
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNodeNotDistributed
@@ -19,7 +19,7 @@ groups:
labels:
severity: critical
annotations:
- summary: Rabbitmq node not distributed (instance {{ $labels.instance }})
+ summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqInstancesDifferentVersions
@@ -28,8 +28,8 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq instances different versions (instance {{ $labels.instance }})
- description: "Running different version of Rabbitmq in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
+ description: "Running different versions of RabbitMQ in the same cluster can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqMemoryHigh
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
@@ -37,7 +37,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq memory high (instance {{ $labels.instance }})
+ summary: RabbitMQ memory high (instance {{ $labels.instance }})
description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqFileDescriptorsUsage
@@ -46,16 +46,25 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }})
+ summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - alert: RabbitmqTooManyReadyMessages
+ expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
+ description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: RabbitmqTooManyUnackMessages
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
for: 1m
labels:
severity: warning
annotations:
- summary: Rabbitmq too many unack messages (instance {{ $labels.instance }})
+ summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections
@@ -64,7 +73,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+ summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNoQueueConsumer
@@ -73,7 +82,7 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq no queue consumer (instance {{ $labels.instance }})
+ summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqUnroutableMessages
@@ -82,5 +91,5 @@ groups:
labels:
severity: warning
annotations:
- summary: Rabbitmq unroutable messages (instance {{ $labels.instance }})
+ summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/rules/redis/oliver006-redis-exporter.yml b/dist/rules/redis/oliver006-redis-exporter.yml
index 08cdf23..6b4dd8d 100644
--- a/dist/rules/redis/oliver006-redis-exporter.yml
+++ b/dist/rules/redis/oliver006-redis-exporter.yml
@@ -77,7 +77,7 @@ groups:
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RedisOutOfConfiguredMaxmemory
- expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
+ expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
for: 2m
labels:
severity: warning
diff --git a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
new file mode 100644
index 0000000..866d715
--- /dev/null
+++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
@@ -0,0 +1,77 @@
+groups:
+
+- name: SmartctlExporter
+
+ rules:
+
+ - alert: SmartDeviceTemperatureWarning
+ expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: SMART device temperature warning (instance {{ $labels.instance }})
+ description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartDeviceTemperatureCritical
+ expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART device temperature critical (instance {{ $labels.instance }})
+ description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartDeviceTemperatureOverTripValue
+ expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART device temperature over trip value (instance {{ $labels.instance }})
+ description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartDeviceTemperatureNearingTripValue
+ expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
+ description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartStatus
+ expr: 'smartctl_device_smart_status != 1'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART status (instance {{ $labels.instance }})
+ description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartCriticalWarning
+ expr: 'smartctl_device_critical_warning > 0'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART critical warning (instance {{ $labels.instance }})
+ description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartMediaErrors
+ expr: 'smartctl_device_media_errors > 0'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART media errors (instance {{ $labels.instance }})
+ description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: SmartWearoutIndicator
+ expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: SMART Wearout Indicator (instance {{ $labels.instance }})
+ description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
diff --git a/dist/template.yml b/dist/template.yml
index 0dd9684..cdde4ea 100644
--- a/dist/template.yml
+++ b/dist/template.yml
@@ -11,6 +11,6 @@ groups:
labels:
severity: {{ rule.severity }}
annotations:
- summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
+ summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
{% endfor %}
\ No newline at end of file