Merge branch 'master' into master-1

This commit is contained in:
Samuel Berthe 2025-02-16 23:46:12 +01:00 committed by GitHub
commit 10724be49c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
36 changed files with 2205 additions and 1068 deletions

View file

@ -1,6 +1,7 @@
name: Publish name: Publish
on: on:
workflow_dispatch:
push: push:
branches: branches:
- master - master
@ -13,22 +14,23 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout Repo - name: Checkout Repo
uses: actions/checkout@v3 uses: actions/checkout@v4
- name: Set up Ruby - name: Set up Ruby
uses: ruby/setup-ruby@v1 uses: ruby/setup-ruby@v1
with: with:
ruby-version: 2.7 ruby-version: 3.4
- name: Set up yq - name: Set up yq
uses: mikefarah/yq@master uses: mikefarah/yq@master
- name: Install liquid - name: Install liquid
run: gem install liquid-cli run: |
gem install liquid -v 5.5.1
gem install liquid-cli
- name: Build rule configuration - name: Build rule configuration
run: | run: |
gem install liquid-cli
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
rm -rf dist/rules rm -rf dist/rules

View file

@ -8,12 +8,12 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- name: Checkout Repo - name: Checkout Repo
uses: actions/checkout@v3 uses: actions/checkout@v4
- name: Set up Ruby - name: Set up Ruby
uses: ruby/setup-ruby@v1 uses: ruby/setup-ruby@v1
with: with:
ruby-version: 2.7 ruby-version: 3.4
- name: Set up yq - name: Set up yq
uses: mikefarah/yq@master uses: mikefarah/yq@master
@ -31,7 +31,7 @@ jobs:
mkdir -p "${subdir}" mkdir -p "${subdir}"
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')) # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug') exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml

1
.gitignore vendored
View file

@ -4,3 +4,4 @@ _site/
.jekyll-metadata .jekyll-metadata
_data/rules.json _data/rules.json
test/rules/ test/rules/
/node_modules

View file

@ -32,8 +32,8 @@ Or with Docker:
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
``` ```
Or with Docker-Compose: Or with Docker Compose:
``` ```
docker-compose up -d docker compose up -d
``` ```

7
FUNDING.json Normal file
View file

@ -0,0 +1,7 @@
{
  "drips": {
    "ethereum": {
      "ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1"
    }
  }
}

View file

@ -1,3 +1,3 @@
source 'https://rubygems.org' source 'https://rubygems.org'
gem 'github-pages', group: :jekyll_plugins gem 'github-pages', '>= 232', group: :jekyll_plugins
gem 'webrick', '~> 1.3', '>= 1.3.1' gem 'webrick', '~> 1.8'

View file

@ -1,66 +1,61 @@
GEM GEM
remote: https://rubygems.org/ remote: https://rubygems.org/
specs: specs:
activesupport (6.0.6.1) activesupport (7.2.1)
concurrent-ruby (~> 1.0, >= 1.0.2) base64
i18n (>= 0.7, < 2) bigdecimal
minitest (~> 5.1) concurrent-ruby (~> 1.0, >= 1.3.1)
tzinfo (~> 1.1) connection_pool (>= 2.2.5)
zeitwerk (~> 2.2, >= 2.2.2) drb
addressable (2.8.0) i18n (>= 1.6, < 2)
public_suffix (>= 2.0.2, < 5.0) logger (>= 1.4.2)
minitest (>= 5.1)
securerandom (>= 0.3)
tzinfo (~> 2.0, >= 2.0.5)
addressable (2.8.7)
public_suffix (>= 2.0.2, < 7.0)
base64 (0.2.0)
bigdecimal (3.1.8)
coffee-script (2.4.1) coffee-script (2.4.1)
coffee-script-source coffee-script-source
execjs execjs
coffee-script-source (1.11.1) coffee-script-source (1.12.2)
colorator (1.1.0) colorator (1.1.0)
commonmarker (0.23.10) commonmarker (0.23.10)
concurrent-ruby (1.2.0) concurrent-ruby (1.3.4)
dnsruby (1.61.9) connection_pool (2.4.1)
simpleidn (~> 0.1) csv (3.3.0)
dnsruby (1.72.2)
simpleidn (~> 0.2.1)
drb (2.2.1)
em-websocket (0.5.3) em-websocket (0.5.3)
eventmachine (>= 0.12.9) eventmachine (>= 0.12.9)
http_parser.rb (~> 0) http_parser.rb (~> 0)
ethon (0.15.0) ethon (0.16.0)
ffi (>= 1.15.0) ffi (>= 1.15.0)
eventmachine (1.2.7) eventmachine (1.2.7)
execjs (2.8.1) execjs (2.9.1)
faraday (1.10.0) faraday (2.12.0)
faraday-em_http (~> 1.0) faraday-net_http (>= 2.0, < 3.4)
faraday-em_synchrony (~> 1.0) json
faraday-excon (~> 1.1) logger
faraday-httpclient (~> 1.0) faraday-net_http (3.3.0)
faraday-multipart (~> 1.0) net-http
faraday-net_http (~> 1.0) ffi (1.17.0)
faraday-net_http_persistent (~> 1.0) ffi (1.17.0-x86_64-linux-gnu)
faraday-patron (~> 1.0) ffi (1.17.0-x86_64-linux-musl)
faraday-rack (~> 1.0)
faraday-retry (~> 1.0)
ruby2_keywords (>= 0.0.4)
faraday-em_http (1.0.0)
faraday-em_synchrony (1.0.0)
faraday-excon (1.1.0)
faraday-httpclient (1.0.1)
faraday-multipart (1.0.3)
multipart-post (>= 1.2, < 3)
faraday-net_http (1.0.1)
faraday-net_http_persistent (1.2.0)
faraday-patron (1.0.0)
faraday-rack (1.0.0)
faraday-retry (1.0.3)
ffi (1.15.5)
forwardable-extended (2.6.0) forwardable-extended (2.6.0)
gemoji (3.0.1) gemoji (4.1.0)
github-pages (226) github-pages (232)
github-pages-health-check (= 1.17.9) github-pages-health-check (= 1.18.2)
jekyll (= 3.9.2) jekyll (= 3.10.0)
jekyll-avatar (= 0.7.0) jekyll-avatar (= 0.8.0)
jekyll-coffeescript (= 1.1.1) jekyll-coffeescript (= 1.2.2)
jekyll-commonmark-ghpages (= 0.2.0) jekyll-commonmark-ghpages (= 0.5.1)
jekyll-default-layout (= 0.1.4) jekyll-default-layout (= 0.1.5)
jekyll-feed (= 0.15.1) jekyll-feed (= 0.17.0)
jekyll-gist (= 1.5.0) jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0) jekyll-github-metadata (= 2.16.1)
jekyll-include-cache (= 0.2.1) jekyll-include-cache (= 0.2.1)
jekyll-mentions (= 1.6.0) jekyll-mentions (= 1.6.0)
jekyll-optional-front-matter (= 0.3.2) jekyll-optional-front-matter (= 0.3.2)
@ -87,32 +82,34 @@ GEM
jekyll-theme-tactile (= 0.2.0) jekyll-theme-tactile (= 0.2.0)
jekyll-theme-time-machine (= 0.2.0) jekyll-theme-time-machine (= 0.2.0)
jekyll-titles-from-headings (= 0.5.3) jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0) jemoji (= 0.13.0)
kramdown (= 2.3.2) kramdown (= 2.4.0)
kramdown-parser-gfm (= 1.1.0) kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.3) liquid (= 4.0.4)
mercenary (~> 0.3) mercenary (~> 0.3)
minima (= 2.5.1) minima (= 2.5.1)
nokogiri (>= 1.13.4, < 2.0) nokogiri (>= 1.16.2, < 2.0)
rouge (= 3.26.0) rouge (= 3.30.0)
terminal-table (~> 1.4) terminal-table (~> 1.4)
github-pages-health-check (1.17.9) webrick (~> 1.8)
github-pages-health-check (1.18.2)
addressable (~> 2.3) addressable (~> 2.3)
dnsruby (~> 1.60) dnsruby (~> 1.60)
octokit (~> 4.0) octokit (>= 4, < 8)
public_suffix (>= 3.0, < 5.0) public_suffix (>= 3.0, < 6.0)
typhoeus (~> 1.3) typhoeus (~> 1.3)
html-pipeline (2.14.1) html-pipeline (2.14.3)
activesupport (>= 2) activesupport (>= 2)
nokogiri (>= 1.4) nokogiri (>= 1.4)
http_parser.rb (0.8.0) http_parser.rb (0.8.0)
i18n (0.9.5) i18n (1.14.6)
concurrent-ruby (~> 1.0) concurrent-ruby (~> 1.0)
jekyll (3.9.2) jekyll (3.10.0)
addressable (~> 2.4) addressable (~> 2.4)
colorator (~> 1.0) colorator (~> 1.0)
csv (~> 3.0)
em-websocket (~> 0.5) em-websocket (~> 0.5)
i18n (~> 0.7) i18n (>= 0.7, < 2)
jekyll-sass-converter (~> 1.0) jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0) jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3) kramdown (>= 1.17, < 3)
@ -121,27 +118,28 @@ GEM
pathutil (~> 0.9) pathutil (~> 0.9)
rouge (>= 1.7, < 4) rouge (>= 1.7, < 4)
safe_yaml (~> 1.0) safe_yaml (~> 1.0)
jekyll-avatar (0.7.0) webrick (>= 1.0)
jekyll-avatar (0.8.0)
jekyll (>= 3.0, < 5.0) jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1) jekyll-coffeescript (1.2.2)
coffee-script (~> 2.2) coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1) coffee-script-source (~> 1.12)
jekyll-commonmark (1.4.0) jekyll-commonmark (1.4.0)
commonmarker (~> 0.22) commonmarker (~> 0.22)
jekyll-commonmark-ghpages (0.2.0) jekyll-commonmark-ghpages (0.5.1)
commonmarker (~> 0.23.4) commonmarker (>= 0.23.7, < 1.1.0)
jekyll (~> 3.9.0) jekyll (>= 3.9, < 4.0)
jekyll-commonmark (~> 1.4.0) jekyll-commonmark (~> 1.4.0)
rouge (>= 2.0, < 4.0) rouge (>= 2.0, < 5.0)
jekyll-default-layout (0.1.4) jekyll-default-layout (0.1.5)
jekyll (~> 3.0) jekyll (>= 3.0, < 5.0)
jekyll-feed (0.15.1) jekyll-feed (0.17.0)
jekyll (>= 3.7, < 5.0) jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0) jekyll-gist (1.5.0)
octokit (~> 4.2) octokit (~> 4.2)
jekyll-github-metadata (2.13.0) jekyll-github-metadata (2.16.1)
jekyll (>= 3.4, < 5.0) jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0) octokit (>= 4, < 7, != 4.4.0)
jekyll-include-cache (0.2.1) jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0) jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.6.0) jekyll-mentions (1.6.0)
@ -212,40 +210,46 @@ GEM
jekyll (>= 3.3, < 5.0) jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1) jekyll-watch (2.2.1)
listen (~> 3.0) listen (~> 3.0)
jemoji (0.12.0) jemoji (0.13.0)
gemoji (~> 3.0) gemoji (>= 3, < 5)
html-pipeline (~> 2.2) html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0) jekyll (>= 3.0, < 5.0)
kramdown (2.3.2) json (2.7.2)
kramdown (2.4.0)
rexml rexml
kramdown-parser-gfm (1.1.0) kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0) kramdown (~> 2.0)
liquid (4.0.3) liquid (4.0.4)
listen (3.7.1) listen (3.9.0)
rb-fsevent (~> 0.10, >= 0.10.3) rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10) rb-inotify (~> 0.9, >= 0.9.10)
logger (1.6.1)
mercenary (0.3.6) mercenary (0.3.6)
mini_portile2 (2.8.7)
minima (2.5.1) minima (2.5.1)
jekyll (>= 3.5, < 5.0) jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9) jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1) jekyll-seo-tag (~> 2.1)
minitest (5.17.0) minitest (5.25.1)
multipart-post (2.1.1) net-http (0.4.1)
nokogiri (1.14.3-x86_64-linux) uri
nokogiri (1.16.7)
mini_portile2 (~> 2.8.2)
racc (~> 1.4) racc (~> 1.4)
octokit (4.22.0) nokogiri (1.16.7-x86_64-linux)
faraday (>= 0.9) racc (~> 1.4)
sawyer (~> 0.8.0, >= 0.5.3) octokit (4.25.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
pathutil (0.16.2) pathutil (0.16.2)
forwardable-extended (~> 2.6) forwardable-extended (~> 2.6)
public_suffix (4.0.7) public_suffix (5.1.1)
racc (1.6.2) racc (1.8.1)
rb-fsevent (0.11.1) rb-fsevent (0.11.2)
rb-inotify (0.10.1) rb-inotify (0.11.1)
ffi (~> 1.0) ffi (~> 1.0)
rexml (3.2.5) rexml (3.3.9)
rouge (3.26.0) rouge (3.30.0)
ruby2_keywords (0.0.5)
rubyzip (2.3.2) rubyzip (2.3.2)
safe_yaml (1.0.5) safe_yaml (1.0.5)
sass (3.7.4) sass (3.7.4)
@ -253,32 +257,29 @@ GEM
sass-listen (4.0.0) sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4) rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7) rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.8.2) sawyer (0.9.2)
addressable (>= 2.3.5) addressable (>= 2.3.5)
faraday (> 0.8, < 2.0) faraday (>= 0.17.3, < 3)
simpleidn (0.2.1) securerandom (0.3.1)
unf (~> 0.1.4) simpleidn (0.2.3)
terminal-table (1.8.0) terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1) unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6) typhoeus (1.4.1)
typhoeus (1.4.0)
ethon (>= 0.9.0) ethon (>= 0.9.0)
tzinfo (1.2.11) tzinfo (2.0.6)
thread_safe (~> 0.1) concurrent-ruby (~> 1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.1)
unicode-display_width (1.8.0) unicode-display_width (1.8.0)
webrick (1.7.0) uri (0.13.1)
zeitwerk (2.6.6) webrick (1.8.2)
PLATFORMS PLATFORMS
ruby
x86_64-linux x86_64-linux
x86_64-linux-musl x86_64-linux-musl
DEPENDENCIES DEPENDENCIES
github-pages github-pages (>= 232)
webrick (~> 1.3, >= 1.3.1) webrick (~> 1.8)
BUNDLED WITH BUNDLED WITH
2.3.13 2.3.13

View file

@ -4,6 +4,21 @@
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)** Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
<div align="center">
<hr>
<sup><b>Sponsored by:</b></sup>
<br>
<a href="https://betterstack.com">
<div>
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
</div>
<div>
Better Stack lets you centralize, search, and visualize your logs.
</div>
</a>
<hr>
</div>
## ✨ Contents ## ✨ Contents
- [Rules](#-rules) - [Rules](#-rules)
@ -18,6 +33,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals) - [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware) - [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers) - [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox) - [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server) - [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
@ -35,12 +51,15 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb) - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq) - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch) - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra) - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper) - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka) - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar) - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats) - [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr) - [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
#### Reverse proxies and load balancers #### Reverse proxies and load balancers
@ -48,6 +67,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache) - [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy) - [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik) - [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
#### Runtimes #### Runtimes
@ -83,7 +103,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki) - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
## 🤝 Contributing ## 🤝 Contributing

File diff suppressed because it is too large Load diff

View file

@ -125,6 +125,18 @@
class="fa fa-linkedin" target="_blank"></a> class="fa fa-linkedin" target="_blank"></a>
</li> </li>
</ul> </ul>
<ul id="sponsoring">
<li>
Kindly supported by&nbsp; 👉
</li>
<li>
<a href="https://betterstack.com/">
<img width="" src="assets/sponsor-betterstack.png" />
</a>
</li>
</ul>
</header> </header>
<main id="content" class="main-content" role="main"> <main id="content" class="main-content" role="main">
@ -147,7 +159,7 @@
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)}; s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
b=c.createElement('script');b.type='text/javascript'; b=c.createElement('script');b.type='text/javascript';
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b); b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
}(window,document,'$screeb','https://t.screeb.app/tag.js')); }(window,document,'$screeb','https://t2.screeb.app/tag.js'));
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db'); $screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
</script> </script>

View file

@ -80,7 +80,7 @@ route:
- receiver: "pager" - receiver: "pager"
group_wait: 10s group_wait: 10s
match_re: match_re:
severity: critial severity: critical
continue: true continue: true
receivers: receivers:
@ -135,4 +135,7 @@ If the notification takes too much time to be triggered, check the following del
- `for: 5m` (alerts/example-mysql.yml) - `for: 5m` (alerts/example-mysql.yml)
- `group_wait = 10s` (alertmanager.yml) - `group_wait = 10s` (alertmanager.yml)
Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html). Also read:
- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html)
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)

View file

@ -115,3 +115,29 @@ h2 {
max-width: 85rem; max-width: 85rem;
} }
} }
ul#sponsoring {
display: flex;
align-items: center;
justify-content: center;
margin-top: 50px;
}
ul#sponsoring li {
display: flex;
padding: 0px 15px;
font-size: 16px;
}
ul#sponsoring li a {
display: flex;
}
ul#sponsoring li a img {
max-width: 180px;
max-height: 80px;
}
.page-header {
padding-bottom: 30px;
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

32
dist/rules/caddy/null.yml vendored Normal file
View file

@ -0,0 +1,32 @@
# Prometheus alerting rules for Caddy: reverse-proxy upstream health and
# HTTP 4xx/5xx error ratios.
groups:
  # A rule group needs a non-empty name; an empty `name:` is null in YAML and
  # the rule file fails validation.
  # NOTE(review): "EmbeddedExporter" matches the group name used by the other
  # natively-instrumented services here — confirm the intended exporter name
  # (the generated file path ends in `null.yml`, so the slug looks unset too).
  - name: EmbeddedExporter
    rules:
      # NOTE(review): count() returns no sample for an empty group, so the
      # `== 0` comparison may never match — confirm whether sum() was meant.
      # The `instance` label is also aggregated away by `by (upstream)`.
      - alert: CaddyReverseProxyDown
        expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
          description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Share of requests answered with a 4xx status over 3m, per instance, in percent.
      - alert: CaddyHighHttp4xxErrorRateService
        expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
          description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Share of requests answered with a 5xx status over 3m, per instance, in percent.
      - alert: CaddyHighHttp5xxErrorRateService
        expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
          description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,131 @@
# Prometheus alerting rules for ClickHouse, grouped under the
# "EmbeddedExporter" rule group: memory, disk, replica, network,
# ZooKeeper and authentication/access alerts.
groups:
  - name: EmbeddedExporter
    rules:
      # --- Memory: used / total cgroup memory, in percent ---
      - alert: ClickhouseMemoryUsageCritical
        expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
          description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseMemoryUsageWarning
        expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
          description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # --- Disk: available / (available + used), in percent ---
      - alert: ClickhouseDiskSpaceLowOnDefault
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
          description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseDiskSpaceCriticalOnDefault
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
          description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseDiskSpaceLowOnBackups
        expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
          description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # --- Replication: error metrics compared against 1 ---
      - alert: ClickhouseReplicaErrors
        expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
          description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseNoAvailableReplicas
        expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
          description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseNoLiveReplicas
        expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
          description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # --- Network and connections: fixed thresholds on gauge values ---
      - alert: ClickhouseHighNetworkTraffic
        expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
          description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseHighTcpConnections
        expr: 'ClickHouseMetrics_TCPConnection > 400'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
          description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseInterserverConnectionIssues
        expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
          description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # NOTE(review): avg() without a `by` clause drops every label, so
      # `{{ $labels.instance }}` below will render empty — confirm intent.
      - alert: ClickhouseZookeeperConnectionIssues
        expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
          description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # --- Security: any increase of the error counters within 5m ---
      - alert: ClickhouseAuthenticationFailures
        expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
          description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: ClickhouseAccessDeniedErrors
        expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
        for: 0m
        labels:
          severity: info
        annotations:
          summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
          description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -23,7 +23,7 @@ groups:
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighCpuUtilization - alert: ContainerHighCpuUtilization
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80' expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -50,8 +50,8 @@ groups:
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ContainerHighThrottleRate - alert: ContainerHighThrottleRate
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1' expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
for: 2m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
@ -69,7 +69,7 @@ groups:
- alert: ContainerLowCpuUtilization - alert: ContainerLowCpuUtilization
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20' expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
for: 7d for: 7d
labels: labels:
severity: info severity: info

View file

@ -138,3 +138,39 @@ groups:
annotations: annotations:
summary: Elasticsearch no new documents (instance {{ $labels.instance }}) summary: Elasticsearch no new documents (instance {{ $labels.instance }})
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingLatency
expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
for: 10m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighIndexingRate
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryRate
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: ElasticsearchHighQueryLatency
expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
for: 5m
labels:
severity: warning
annotations:
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,14 @@
# Prometheus alerting rule for Grafana Alloy availability.
groups:
  - name: EmbeddedExporter
    rules:
      # `A unless B` keeps the series of A that have no label match in B.
      # To detect a service that went away, A must be the past
      # (`offset 2m`) and B the present: an instance that reported
      # alloy_build_info two minutes ago but does not report it now.
      # With the operands the other way round the rule matches instances
      # that have just appeared, i.e. the opposite of "down".
      - alert: GrafanaAlloyServiceDown
        expr: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Grafana Alloy service down (instance {{ $labels.instance }})
          description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -0,0 +1,59 @@
groups:
  - name: EmbeddedExporter
    rules:
      # --- Provider health -------------------------------------------------
      # Each rule below matches one specific value of eth_rpc_status and fires
      # immediately (for: 0m). The meaning attached to each value is the one
      # given in that rule's summary.

      # eth_rpc_status == 1: net_version call failed.
      - alert: ProviderFailedBecauseNet_versionFailed
        expr: "eth_rpc_status == 1"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Provider failed because net_version failed (instance {{ $labels.instance }})'
          description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # eth_rpc_status == 2: fetching the genesis block failed.
      - alert: ProviderFailedBecauseGetGenesisFailed
        expr: "eth_rpc_status == 2"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Provider failed because get genesis failed (instance {{ $labels.instance }})'
          description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # eth_rpc_status == 3: net_version call timed out.
      - alert: ProviderFailedBecauseNet_versionTimeout
        expr: "eth_rpc_status == 3"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Provider failed because net_version timeout (instance {{ $labels.instance }})'
          description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # eth_rpc_status == 4: fetching the genesis block timed out.
      - alert: ProviderFailedBecauseGetGenesisTimeout
        expr: "eth_rpc_status == 4"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Provider failed because get genesis timeout (instance {{ $labels.instance }})'
          description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # --- Store connection latency ---------------------------------------
      # Two rules share the alert name and differ only in threshold and
      # severity: above 10 is a warning, above 20 is critical. The thresholds
      # overlap, so a value above 20 raises both alerts.

      - alert: StoreConnectionIsTooSlow
        expr: "store_connection_wait_time_ms > 10"
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Store connection is too slow (instance {{ $labels.instance }})'
          description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: StoreConnectionIsTooSlow
        expr: "store_connection_wait_time_ms > 20"
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Store connection is too slow (instance {{ $labels.instance }})'
          description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

95
dist/rules/hadoop/jmx_exporter.yml vendored Normal file
View file

@ -0,0 +1,95 @@
groups:
  - name: Jmx_exporter
    rules:
      # Scrape target with job "hadoop-namenode" has reported up == 0 for 5m.
      - alert: HadoopNameNodeDown
        expr: 'up{job="hadoop-namenode"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Name Node Down (instance {{ $labels.instance }})
          description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Scrape target with job "hadoop-resourcemanager" has reported up == 0 for 5m.
      - alert: HadoopResourceManagerDown
        expr: 'up{job="hadoop-resourcemanager"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
          description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # hadoop_datanode_last_heartbeat has been exactly 0 for 10m.
      - alert: HadoopDataNodeOutOfService
        expr: 'hadoop_datanode_last_heartbeat == 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
          description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Free fraction of HDFS capacity, (total - used) / total, below 10% for 15m.
      - alert: HadoopHdfsDiskSpaceLow
        expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
          description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 100 new MapReduce task failures within the last hour.
      # The metric is a cumulative *_total counter: comparing its raw value
      # with a threshold fires permanently once 100 failures have ever
      # happened, so the growth over a window is used instead.
      # The 1h window is a starting point - tune it to the workload.
      - alert: HadoopMapReduceTaskFailures
        expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
          description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ResourceManager memory above 80% of its maximum for 15m.
      - alert: HadoopResourceManagerMemoryHigh
        expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
          description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 10 new YARN container allocation failures within the last
      # hour. Uses increase() for the same reason as the MapReduce rule: the
      # raw *_total counter never goes back below the threshold.
      # The 1h window is a starting point - tune it to the workload.
      - alert: HadoopYarnContainerAllocationFailures
        expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
          description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # HBase region count above 5000 for 15m.
      - alert: HadoopHbaseRegionCountHigh
        expr: 'hadoop_hbase_region_count > 5000'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
          description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # heap_bytes / max_heap_bytes below 20% for 10m.
      # NOTE(review): if heap_bytes is the *used* heap, a low ratio means
      # plenty of free heap, which contradicts "running low on heap space";
      # confirm the metric's meaning before relying on this rule.
      - alert: HadoopHbaseRegionServerHeapLow
        expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
          description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # HBase write request latency above 0.5 (seconds, per the metric name) for 10m.
      - alert: HadoopHbaseWriteRequestsLatencyHigh
        expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
          description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -77,7 +77,7 @@ groups:
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HaproxyBackendMaxActiveSession - alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80' expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning

View file

@ -5,7 +5,7 @@ groups:
rules: rules:
- alert: HostOutOfMemory - alert: HostOutOfMemory
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -14,106 +14,97 @@ groups:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure - alert: HostMemoryUnderMemoryPressure
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
for: 2m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }}) summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized - alert: HostMemoryIsUnderutilized
expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 1w for: 0m
labels: labels:
severity: info severity: info
annotations: annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }}) summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn - alert: HostUnusualNetworkThroughputIn
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 5m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual network throughput in (instance {{ $labels.instance }}) summary: Host unusual network throughput in (instance {{ $labels.instance }})
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut - alert: HostUnusualNetworkThroughputOut
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
for: 5m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual network throughput out (instance {{ $labels.instance }}) summary: Host unusual network throughput out (instance {{ $labels.instance }})
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRate - alert: HostUnusualDiskReadRate
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
for: 5m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk read rate (instance {{ $labels.instance }}) summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRate
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace - alert: HostOutOfDiskSpace
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: critical
annotations: annotations:
summary: Host out of disk space (instance {{ $labels.instance }}) summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours - alert: HostDiskMayFillIn24Hours
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes - alert: HostOutOfInodes
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: critical
annotations: annotations:
summary: Host out of inodes (instance {{ $labels.instance }}) summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostFilesystemDeviceError - alert: HostFilesystemDeviceError
expr: 'node_filesystem_device_error == 1' expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
for: 0m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Host filesystem device error (instance {{ $labels.instance }}) summary: Host filesystem device error (instance {{ $labels.instance }})
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours - alert: HostInodesMayFillIn24Hours
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency - alert: HostUnusualDiskReadLatency
# Right-hand (new) expression: closing parenthesis restored. The group opened before rate(...) was never closed, which is a PromQL parse error.
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -122,7 +113,7 @@ groups:
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency - alert: HostUnusualDiskWriteLatency
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -131,7 +122,7 @@ groups:
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad - alert: HostHighCpuLoad
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -140,16 +131,16 @@ groups:
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized - alert: HostCpuIsUnderutilized
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w for: 1w
labels: labels:
severity: info severity: info
annotations: annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }}) summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor - alert: HostCpuStealNoisyNeighbor
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -158,34 +149,34 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuHighIowait - alert: HostCpuHighIowait
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }}) summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskIo - alert: HostUnusualDiskIo
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host unusual disk IO (instance {{ $labels.instance }}) summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostContextSwitching - alert: HostContextSwitchingHigh
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host context switching (instance {{ $labels.instance }}) summary: Host context switching high (instance {{ $labels.instance }})
description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp - alert: HostSwapIsFillingUp
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -194,7 +185,7 @@ groups:
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed - alert: HostSystemdServiceCrashed
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_systemd_unit_state{state="failed"} == 1)'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -203,7 +194,7 @@ groups:
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostPhysicalComponentTooHot - alert: HostPhysicalComponentTooHot
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -212,7 +203,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm - alert: HostNodeOvertemperatureAlarm
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
@ -220,35 +211,35 @@ groups:
summary: Host node overtemperature alarm (instance {{ $labels.instance }}) summary: Host node overtemperature alarm (instance {{ $labels.instance }})
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidArrayGotInactive - alert: HostSoftwareRaidInsufficientDrives
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }}) summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRaidDiskFailure - alert: HostSoftwareRaidDiskFailure
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_md_disks{state="failed"} > 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Host RAID disk failure (instance {{ $labels.instance }}) summary: Host software RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations - alert: HostKernelVersionDeviations
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: 'changes(node_uname_info[1h]) > 0'
for: 6h for: 0m
labels: labels:
severity: warning severity: info
annotations: annotations:
summary: Host kernel version deviations (instance {{ $labels.instance }}) summary: Host kernel version deviations (instance {{ $labels.instance }})
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected - alert: HostOomKillDetected
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -257,7 +248,7 @@ groups:
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected - alert: HostEdacCorrectableErrorsDetected
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
for: 0m for: 0m
labels: labels:
severity: info severity: info
@ -266,7 +257,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected - alert: HostEdacUncorrectableErrorsDetected
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_edac_uncorrectable_errors_total > 0)'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -275,7 +266,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors - alert: HostNetworkReceiveErrors
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -284,7 +275,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkTransmitErrors - alert: HostNetworkTransmitErrors
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -292,17 +283,8 @@ groups:
summary: Host Network Transmit Errors (instance {{ $labels.instance }}) summary: Host Network Transmit Errors (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkBondDegraded - alert: HostNetworkBondDegraded
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((node_bonding_active - node_bonding_slaves) != 0)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -311,7 +293,7 @@ groups:
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostConntrackLimit - alert: HostConntrackLimit
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning
@ -320,7 +302,7 @@ groups:
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew - alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -329,7 +311,7 @@ groups:
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockNotSynchronising - alert: HostClockNotSynchronising
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -338,7 +320,7 @@ groups:
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostRequiresReboot - alert: HostRequiresReboot
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' expr: '(node_reboot_required > 0)'
for: 4h for: 4h
labels: labels:
severity: info severity: info

View file

@ -10,44 +10,44 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes node not ready (instance {{ $labels.instance }}) summary: Kubernetes Node not ready (node {{ $labels.node }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesMemoryPressure - alert: KubernetesNodeMemoryPressure
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes memory pressure (instance {{ $labels.instance }}) summary: Kubernetes memory pressure (node {{ $labels.node }})
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDiskPressure - alert: KubernetesNodeDiskPressure
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes disk pressure (instance {{ $labels.instance }}) summary: Kubernetes disk pressure (node {{ $labels.node }})
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesNetworkUnavailable - alert: KubernetesNodeNetworkUnavailable
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes network unavailable (instance {{ $labels.instance }}) summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
description: "{{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesOutOfCapacity - alert: KubernetesNodeOutOfPodCapacity
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes out of capacity (instance {{ $labels.instance }}) summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesContainerOomKiller - alert: KubernetesContainerOomKiller
expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
@ -55,7 +55,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes container oom killer (instance {{ $labels.instance }}) summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobFailed - alert: KubernetesJobFailed
@ -64,16 +64,25 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes Job failed (instance {{ $labels.instance }}) summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns immediately (for: 0m) about a Job that has no active, failed or
# succeeded pods although more than 600 seconds have passed since its start time.
- alert: KubernetesJobNotStarting
  # Folded scalar: the line breaks below are joined with single spaces.
  expr: >-
    kube_job_status_active == 0
    and kube_job_status_failed == 0
    and kube_job_status_succeeded == 0
    and (time() - kube_job_status_start_time) > 600
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: KubernetesCronjobSuspended - alert: KubernetesCronjobSuspended
expr: 'kube_cronjob_spec_suspend != 0' expr: 'kube_cronjob_spec_suspend != 0'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeclaimPending - alert: KubernetesPersistentvolumeclaimPending
@ -82,7 +91,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeOutOfDiskSpace - alert: KubernetesVolumeOutOfDiskSpace
@ -95,13 +104,13 @@ groups:
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesVolumeFullInFourDays - alert: KubernetesVolumeFullInFourDays
expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0' expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPersistentvolumeError - alert: KubernetesPersistentvolumeError
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
@ -109,8 +118,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetDown - alert: KubernetesStatefulsetDown
expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0' expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
@ -118,35 +127,35 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScalingAbility - alert: KubernetesHpaScaleInability
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaMetricAvailability - alert: KubernetesHpaMetricsUnavailability
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaScaleCapability - alert: KubernetesHpaScaleMaximum
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas' expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
for: 2m for: 2m
labels: labels:
severity: info severity: info
annotations: annotations:
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesHpaUnderutilized - alert: KubernetesHpaUnderutilized
expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
@ -155,7 +164,7 @@ groups:
severity: info severity: info
annotations: annotations:
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodNotHealthy - alert: KubernetesPodNotHealthy
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
@ -163,8 +172,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesPodCrashLooping - alert: KubernetesPodCrashLooping
expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3' expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
@ -172,17 +181,17 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesReplicassetMismatch - alert: KubernetesReplicasetReplicasMismatch
expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas' expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentReplicasMismatch - alert: KubernetesDeploymentReplicasMismatch
expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available' expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
@ -190,8 +199,8 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetReplicasMismatch - alert: KubernetesStatefulsetReplicasMismatch
expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas' expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
@ -200,7 +209,7 @@ groups:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDeploymentGenerationMismatch - alert: KubernetesDeploymentGenerationMismatch
expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation' expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@ -208,8 +217,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetGenerationMismatch - alert: KubernetesStatefulsetGenerationMismatch
expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation' expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
@ -217,8 +226,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesStatefulsetUpdateNotRolledOut - alert: KubernetesStatefulsetUpdateNotRolledOut
expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)' expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
@ -226,8 +235,8 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetRolloutStuck - alert: KubernetesDaemonsetRolloutStuck
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
@ -235,8 +244,8 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesDaemonsetMisscheduled - alert: KubernetesDaemonsetMisscheduled
expr: 'kube_daemonset_status_number_misscheduled > 0' expr: 'kube_daemonset_status_number_misscheduled > 0'
@ -244,8 +253,8 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesCronjobTooLong - alert: KubernetesCronjobTooLong
expr: 'time() - kube_cronjob_next_schedule_time > 3600' expr: 'time() - kube_cronjob_next_schedule_time > 3600'
@ -253,7 +262,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesJobSlowCompletion - alert: KubernetesJobSlowCompletion
@ -262,11 +271,11 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Kubernetes job slow completion (instance {{ $labels.instance }}) summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerErrors - alert: KubernetesApiServerErrors
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
@ -302,7 +311,7 @@ groups:
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KubernetesApiServerLatency - alert: KubernetesApiServerLatency
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1' expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning

View file

@ -0,0 +1,23 @@
# Alerting rules for the metrics exposed by Meilisearch itself.
groups:
  - name: EmbeddedExporter
    rules:
      # Warns as soon as an index reports zero documents.
      - alert: MeilisearchIndexIsEmpty
        expr: 'meilisearch_index_docs_count == 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Meilisearch index is empty (instance {{ $labels.instance }})
          # The text describes the empty-index condition tested by the
          # expression; it previously claimed the instance was down.
          description: "Meilisearch index has zero documents\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Warns as soon as the reported HTTP response time exceeds 0.5 seconds.
      - alert: MeilisearchHttpResponseTime
        expr: 'meilisearch_http_response_time_seconds > 0.5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Meilisearch http response time (instance {{ $labels.instance }})
          description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -5,7 +5,7 @@ groups:
rules: rules:
- alert: MinioClusterDiskOffline - alert: MinioClusterDiskOffline
expr: 'minio_cluster_disk_offline_total > 0' expr: 'minio_cluster_drive_offline_total > 0'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical

View file

@ -66,12 +66,3 @@ groups:
annotations: annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }}) summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when, per instance, summed virtual memory is more than three times the
# summed resident memory for two minutes.
- alert: MongodbVirtualMemoryUsage
  expr: "(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3"
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'MongoDB virtual memory usage (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      High memory usage
       VALUE = {{ $value }}
       LABELS = {{ $labels }}

View file

@ -22,6 +22,15 @@ groups:
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }}) summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the one-minute peak of the prepared-statement count stays above
# 80% of max_prepared_stmt_count for two minutes.
# NOTE(review): the alert name contains "(>80%)", unlike the CamelCase-only
# names of the neighbouring rules; kept as-is because routing may match on it.
- alert: 'MysqlHighPreparedStatementsUtilization(>80%)'
  # Folded scalar: the line break below is joined with a single space.
  expr: >-
    max_over_time(mysql_global_status_prepared_stmt_count[1m])
    / mysql_global_variables_max_prepared_stmt_count * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      High utilization of prepared statements (>80%) on {{ $labels.instance }}
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
- alert: MysqlHighThreadsRunning - alert: MysqlHighThreadsRunning
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60' expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
for: 2m for: 2m
@ -84,3 +93,39 @@ groups:
annotations: annotations:
summary: MySQL restarted (instance {{ $labels.instance }}) summary: MySQL restarted (instance {{ $labels.instance }})
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Informational alert: the instantaneous per-second rate of the questions
# counter (1m lookback) has been above 10000 for two minutes.
- alert: MysqlHighQps
  expr: "irate(mysql_global_status_questions[1m]) > 10000"
  for: 2m
  labels:
    severity: info
  annotations:
    summary: 'MySQL High QPS (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      MySQL is being overload with unusual QPS (> 10k QPS).
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
# Warns when InnoDB's open-file count stays above 75% of open_files_limit for
# two minutes.
- alert: MysqlTooManyOpenFiles
  # Folded scalar: the line break below is joined with a single space.
  expr: >-
    mysql_global_status_innodb_num_open_files
    / mysql_global_variables_open_files_limit * 100 > 75
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'MySQL too many open files (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      MySQL has too many open files, consider increase variables open_files_limit on {{ $labels.instance }}.
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
# Warns when the innodb_force_recovery variable has been non-zero for two
# minutes.
- alert: MysqlInnodbForceRecoveryIsEnabled
  expr: "mysql_global_variables_innodb_force_recovery != 0"
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      MySQL InnoDB force recovery is enabled on {{ $labels.instance }}
       VALUE = {{ $value }}
       LABELS = {{ $labels }}
# Warns when the InnoDB history length metric has been above 50000 for two
# minutes.
- alert: MysqlInnodbHistory_lenTooLong
  expr: "mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000"
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'MySQL InnoDB history_len too long (instance {{ $labels.instance }})'
    # The VALUE / LABELS lines keep one extra leading space on purpose.
    description: |-
      MySQL history_len (undo log) too long on {{ $labels.instance }}
       VALUE = {{ $value }}
       LABELS = {{ $labels }}

View file

@ -32,10 +32,154 @@ groups:
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighRoutesCount - alert: NatsHighRoutesCount
expr: 'gnatsd_routez_num_routes > 10' expr: 'gnatsd_varz_routes > 10'
for: 3m for: 3m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Nats high routes count (instance {{ $labels.instance }}) summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighMemoryUsage
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high memory usage (instance {{ $labels.instance }})
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsSlowConsumers
expr: 'gnatsd_varz_slow_consumers > 0'
for: 3m
labels:
severity: critical
annotations:
summary: Nats slow consumers (instance {{ $labels.instance }})
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsServerDown
expr: 'absent(up{job="nats"})'
for: 5m
labels:
severity: critical
annotations:
summary: Nats server down (instance {{ $labels.instance }})
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighCpuUsage
expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high CPU usage (instance {{ $labels.instance }})
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfConnections
expr: 'gnatsd_connz_num_connections > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of connections (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamStoreUsage
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighJetstreamMemoryUsage
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighNumberOfSubscriptions
expr: 'gnatsd_connz_subscriptions > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsTooManyErrors
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats too many errors (instance {{ $labels.instance }})
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsJetstreamConsumersExceeded
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsFrequentAuthenticationTimeouts
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsMaxPayloadSizeExceeded
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
for: 5m
labels:
severity: critical
annotations:
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsLeafNodeConnectionIssue
expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
for: 5m
labels:
severity: critical
annotations:
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsMaxPingOperationsExceeded
expr: 'gnatsd_varz_ping_max > 50'
for: 5m
labels:
severity: warning
annotations:
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsWriteDeadlineExceeded
expr: 'gnatsd_varz_write_deadline > 10'
for: 5m
labels:
severity: critical
annotations:
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -23,7 +23,7 @@ groups:
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NetdataHighMemoryUsage - alert: NetdataHighMemoryUsage
expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20' expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
for: 5m for: 5m
labels: labels:
severity: warning severity: warning

View file

@ -32,7 +32,7 @@ groups:
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoVacuumed - alert: PostgresqlTableNotAutoVacuumed
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10' expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -41,7 +41,7 @@ groups:
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoAnalyzed - alert: PostgresqlTableNotAutoAnalyzed
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10' expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
for: 0m for: 0m
labels: labels:
severity: warning severity: warning
@ -50,7 +50,7 @@ groups:
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyConnections - alert: PostgresqlTooManyConnections
expr: '' expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning
@ -62,7 +62,7 @@ groups:
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for: 2m for: 2m
labels: labels:
severity: warning severity: critical
annotations: annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }}) summary: Postgresql not enough connections (instance {{ $labels.instance }})
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@ -86,7 +86,7 @@ groups:
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlCommitRateLow - alert: PostgresqlCommitRateLow
expr: 'rate(pg_stat_database_xact_commit[1m]) < 10' expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
for: 2m for: 2m
labels: labels:
severity: critical severity: critical
@ -140,7 +140,7 @@ groups:
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlConfigurationChanged - alert: PostgresqlConfigurationChanged
expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
for: 0m for: 0m
labels: labels:
severity: info severity: info
@ -155,7 +155,7 @@ groups:
severity: critical severity: critical
annotations: annotations:
summary: Postgresql SSL compression active (instance {{ $labels.instance }}) summary: Postgresql SSL compression active (instance {{ $labels.instance }})
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired - alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
@ -183,3 +183,12 @@ groups:
annotations: annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning
annotations:
summary: Postgresql invalid index (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -32,7 +32,7 @@ groups:
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PrometheusTargetMissingWithWarmupTime - alert: PrometheusTargetMissingWithWarmupTime
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
for: 0m for: 0m
labels: labels:
severity: critical severity: critical

View file

@ -10,7 +10,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq down (instance {{ $labels.instance }}) summary: RabbitMQ down (instance {{ $labels.instance }})
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqClusterDown - alert: RabbitmqClusterDown
@ -19,7 +19,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq cluster down (instance {{ $labels.instance }}) summary: RabbitMQ cluster down (instance {{ $labels.instance }})
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqClusterPartition - alert: RabbitmqClusterPartition
@ -28,7 +28,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq cluster partition (instance {{ $labels.instance }}) summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqOutOfMemory - alert: RabbitmqOutOfMemory
@ -37,7 +37,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq out of memory (instance {{ $labels.instance }}) summary: RabbitMQ out of memory (instance {{ $labels.instance }})
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections - alert: RabbitmqTooManyConnections
@ -46,7 +46,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq too many connections (instance {{ $labels.instance }}) summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqDeadLetterQueueFillingUp - alert: RabbitmqDeadLetterQueueFillingUp
@ -55,7 +55,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq dead letter queue filling up (instance {{ $labels.instance }}) summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyMessagesInQueue - alert: RabbitmqTooManyMessagesInQueue
@ -64,7 +64,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq too many messages in queue (instance {{ $labels.instance }}) summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqSlowQueueConsuming - alert: RabbitmqSlowQueueConsuming
@ -73,7 +73,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq slow queue consuming (instance {{ $labels.instance }}) summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNoConsumer - alert: RabbitmqNoConsumer
@ -82,7 +82,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq no consumer (instance {{ $labels.instance }}) summary: RabbitMQ no consumer (instance {{ $labels.instance }})
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConsumers - alert: RabbitmqTooManyConsumers
@ -91,7 +91,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq too many consumers (instance {{ $labels.instance }}) summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqUnactiveExchange - alert: RabbitmqUnactiveExchange
@ -100,5 +100,5 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq unactive exchange (instance {{ $labels.instance }}) summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -10,7 +10,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq node down (instance {{ $labels.instance }}) summary: RabbitMQ node down (instance {{ $labels.instance }})
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNodeNotDistributed - alert: RabbitmqNodeNotDistributed
@ -19,7 +19,7 @@ groups:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: Rabbitmq node not distributed (instance {{ $labels.instance }}) summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqInstancesDifferentVersions - alert: RabbitmqInstancesDifferentVersions
@ -28,8 +28,8 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq instances different versions (instance {{ $labels.instance }}) summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
description: "Running different version of Rabbitmq in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqMemoryHigh - alert: RabbitmqMemoryHigh
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90' expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
@ -37,7 +37,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq memory high (instance {{ $labels.instance }}) summary: RabbitMQ memory high (instance {{ $labels.instance }})
description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqFileDescriptorsUsage - alert: RabbitmqFileDescriptorsUsage
@ -46,16 +46,25 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }}) summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyReadyMessages
expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
for: 1m
labels:
severity: warning
annotations:
summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyUnackMessages - alert: RabbitmqTooManyUnackMessages
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq too many unack messages (instance {{ $labels.instance }}) summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqTooManyConnections - alert: RabbitmqTooManyConnections
@ -64,7 +73,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq too many connections (instance {{ $labels.instance }}) summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqNoQueueConsumer - alert: RabbitmqNoQueueConsumer
@ -73,7 +82,7 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq no queue consumer (instance {{ $labels.instance }}) summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RabbitmqUnroutableMessages - alert: RabbitmqUnroutableMessages
@ -82,5 +91,5 @@ groups:
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Rabbitmq unroutable messages (instance {{ $labels.instance }}) summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -77,7 +77,7 @@ groups:
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: RedisOutOfConfiguredMaxmemory - alert: RedisOutOfConfiguredMaxmemory
expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90' expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
for: 2m for: 2m
labels: labels:
severity: warning severity: warning

View file

@ -0,0 +1,77 @@
groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning
expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
for: 0m
labels:
severity: warning
annotations:
summary: SMART device temperature warning (instance {{ $labels.instance }})
description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureCritical
expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
for: 0m
labels:
severity: critical
annotations:
summary: SMART device temperature critical (instance {{ $labels.instance }})
description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureOverTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
for: 0m
labels:
severity: critical
annotations:
summary: SMART device temperature over trip value (instance {{ $labels.instance }})
description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartDeviceTemperatureNearingTripValue
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
for: 0m
labels:
severity: warning
annotations:
summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartStatus
expr: 'smartctl_device_smart_status != 1'
for: 0m
labels:
severity: critical
annotations:
summary: SMART status (instance {{ $labels.instance }})
description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartCriticalWarning
expr: 'smartctl_device_critical_warning > 0'
for: 0m
labels:
severity: critical
annotations:
summary: SMART critical warning (instance {{ $labels.instance }})
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartMediaErrors
expr: 'smartctl_device_media_errors > 0'
for: 0m
labels:
severity: critical
annotations:
summary: SMART media errors (instance {{ $labels.instance }})
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: SmartWearoutIndicator
expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
for: 0m
labels:
severity: critical
annotations:
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

2
dist/template.yml vendored
View file

@ -11,6 +11,6 @@ groups:
labels: labels:
severity: {{ rule.severity }} severity: {{ rule.severity }}
annotations: annotations:
summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}) summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}" description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
{% endfor %} {% endfor %}