mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 11:27:00 +08:00
Merge branch 'master' into master-1
This commit is contained in:
commit
10724be49c
36 changed files with 2205 additions and 1068 deletions
10
.github/workflows/dist.yml
vendored
10
.github/workflows/dist.yml
vendored
|
|
@ -1,6 +1,7 @@
|
||||||
name: Publish
|
name: Publish
|
||||||
|
|
||||||
on:
|
on:
|
||||||
|
workflow_dispatch:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
|
|
@ -13,22 +14,23 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Repo
|
- name: Checkout Repo
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set up Ruby
|
- name: Set up Ruby
|
||||||
uses: ruby/setup-ruby@v1
|
uses: ruby/setup-ruby@v1
|
||||||
with:
|
with:
|
||||||
ruby-version: 2.7
|
ruby-version: 3.4
|
||||||
|
|
||||||
- name: Set up yq
|
- name: Set up yq
|
||||||
uses: mikefarah/yq@master
|
uses: mikefarah/yq@master
|
||||||
|
|
||||||
- name: Install liquid
|
- name: Install liquid
|
||||||
run: gem install liquid-cli
|
run: |
|
||||||
|
gem install liquid -v 5.5.1
|
||||||
|
gem install liquid-cli
|
||||||
|
|
||||||
- name: Build rule configuration
|
- name: Build rule configuration
|
||||||
run: |
|
run: |
|
||||||
gem install liquid-cli
|
|
||||||
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
||||||
|
|
||||||
rm -rf dist/rules
|
rm -rf dist/rules
|
||||||
|
|
|
||||||
6
.github/workflows/test.yml
vendored
6
.github/workflows/test.yml
vendored
|
|
@ -8,12 +8,12 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout Repo
|
- name: Checkout Repo
|
||||||
uses: actions/checkout@v3
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
- name: Set up Ruby
|
- name: Set up Ruby
|
||||||
uses: ruby/setup-ruby@v1
|
uses: ruby/setup-ruby@v1
|
||||||
with:
|
with:
|
||||||
ruby-version: 2.7
|
ruby-version: 3.4
|
||||||
|
|
||||||
- name: Set up yq
|
- name: Set up yq
|
||||||
uses: mikefarah/yq@master
|
uses: mikefarah/yq@master
|
||||||
|
|
@ -31,7 +31,7 @@ jobs:
|
||||||
mkdir -p "${subdir}"
|
mkdir -p "${subdir}"
|
||||||
|
|
||||||
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
|
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
|
||||||
|
|
||||||
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
|
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
|
||||||
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
|
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
|
||||||
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
|
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
|
||||||
|
|
|
||||||
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -4,3 +4,4 @@ _site/
|
||||||
.jekyll-metadata
|
.jekyll-metadata
|
||||||
_data/rules.json
|
_data/rules.json
|
||||||
test/rules/
|
test/rules/
|
||||||
|
/node_modules
|
||||||
|
|
@ -32,8 +32,8 @@ Or with Docker:
|
||||||
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
|
docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
|
||||||
```
|
```
|
||||||
|
|
||||||
Or with Docker-Compose:
|
Or with Docker Compose:
|
||||||
|
|
||||||
```
|
```
|
||||||
docker-compose up -d
|
docker compose up -d
|
||||||
```
|
```
|
||||||
|
|
|
||||||
7
FUNDING.json
Normal file
7
FUNDING.json
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
{
|
||||||
|
"drips": {
|
||||||
|
"ethereum": {
|
||||||
|
"ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
4
Gemfile
4
Gemfile
|
|
@ -1,3 +1,3 @@
|
||||||
source 'https://rubygems.org'
|
source 'https://rubygems.org'
|
||||||
gem 'github-pages', group: :jekyll_plugins
|
gem 'github-pages', '>= 232', group: :jekyll_plugins
|
||||||
gem 'webrick', '~> 1.3', '>= 1.3.1'
|
gem 'webrick', '~> 1.8'
|
||||||
211
Gemfile.lock
211
Gemfile.lock
|
|
@ -1,66 +1,61 @@
|
||||||
GEM
|
GEM
|
||||||
remote: https://rubygems.org/
|
remote: https://rubygems.org/
|
||||||
specs:
|
specs:
|
||||||
activesupport (6.0.6.1)
|
activesupport (7.2.1)
|
||||||
concurrent-ruby (~> 1.0, >= 1.0.2)
|
base64
|
||||||
i18n (>= 0.7, < 2)
|
bigdecimal
|
||||||
minitest (~> 5.1)
|
concurrent-ruby (~> 1.0, >= 1.3.1)
|
||||||
tzinfo (~> 1.1)
|
connection_pool (>= 2.2.5)
|
||||||
zeitwerk (~> 2.2, >= 2.2.2)
|
drb
|
||||||
addressable (2.8.0)
|
i18n (>= 1.6, < 2)
|
||||||
public_suffix (>= 2.0.2, < 5.0)
|
logger (>= 1.4.2)
|
||||||
|
minitest (>= 5.1)
|
||||||
|
securerandom (>= 0.3)
|
||||||
|
tzinfo (~> 2.0, >= 2.0.5)
|
||||||
|
addressable (2.8.7)
|
||||||
|
public_suffix (>= 2.0.2, < 7.0)
|
||||||
|
base64 (0.2.0)
|
||||||
|
bigdecimal (3.1.8)
|
||||||
coffee-script (2.4.1)
|
coffee-script (2.4.1)
|
||||||
coffee-script-source
|
coffee-script-source
|
||||||
execjs
|
execjs
|
||||||
coffee-script-source (1.11.1)
|
coffee-script-source (1.12.2)
|
||||||
colorator (1.1.0)
|
colorator (1.1.0)
|
||||||
commonmarker (0.23.10)
|
commonmarker (0.23.10)
|
||||||
concurrent-ruby (1.2.0)
|
concurrent-ruby (1.3.4)
|
||||||
dnsruby (1.61.9)
|
connection_pool (2.4.1)
|
||||||
simpleidn (~> 0.1)
|
csv (3.3.0)
|
||||||
|
dnsruby (1.72.2)
|
||||||
|
simpleidn (~> 0.2.1)
|
||||||
|
drb (2.2.1)
|
||||||
em-websocket (0.5.3)
|
em-websocket (0.5.3)
|
||||||
eventmachine (>= 0.12.9)
|
eventmachine (>= 0.12.9)
|
||||||
http_parser.rb (~> 0)
|
http_parser.rb (~> 0)
|
||||||
ethon (0.15.0)
|
ethon (0.16.0)
|
||||||
ffi (>= 1.15.0)
|
ffi (>= 1.15.0)
|
||||||
eventmachine (1.2.7)
|
eventmachine (1.2.7)
|
||||||
execjs (2.8.1)
|
execjs (2.9.1)
|
||||||
faraday (1.10.0)
|
faraday (2.12.0)
|
||||||
faraday-em_http (~> 1.0)
|
faraday-net_http (>= 2.0, < 3.4)
|
||||||
faraday-em_synchrony (~> 1.0)
|
json
|
||||||
faraday-excon (~> 1.1)
|
logger
|
||||||
faraday-httpclient (~> 1.0)
|
faraday-net_http (3.3.0)
|
||||||
faraday-multipart (~> 1.0)
|
net-http
|
||||||
faraday-net_http (~> 1.0)
|
ffi (1.17.0)
|
||||||
faraday-net_http_persistent (~> 1.0)
|
ffi (1.17.0-x86_64-linux-gnu)
|
||||||
faraday-patron (~> 1.0)
|
ffi (1.17.0-x86_64-linux-musl)
|
||||||
faraday-rack (~> 1.0)
|
|
||||||
faraday-retry (~> 1.0)
|
|
||||||
ruby2_keywords (>= 0.0.4)
|
|
||||||
faraday-em_http (1.0.0)
|
|
||||||
faraday-em_synchrony (1.0.0)
|
|
||||||
faraday-excon (1.1.0)
|
|
||||||
faraday-httpclient (1.0.1)
|
|
||||||
faraday-multipart (1.0.3)
|
|
||||||
multipart-post (>= 1.2, < 3)
|
|
||||||
faraday-net_http (1.0.1)
|
|
||||||
faraday-net_http_persistent (1.2.0)
|
|
||||||
faraday-patron (1.0.0)
|
|
||||||
faraday-rack (1.0.0)
|
|
||||||
faraday-retry (1.0.3)
|
|
||||||
ffi (1.15.5)
|
|
||||||
forwardable-extended (2.6.0)
|
forwardable-extended (2.6.0)
|
||||||
gemoji (3.0.1)
|
gemoji (4.1.0)
|
||||||
github-pages (226)
|
github-pages (232)
|
||||||
github-pages-health-check (= 1.17.9)
|
github-pages-health-check (= 1.18.2)
|
||||||
jekyll (= 3.9.2)
|
jekyll (= 3.10.0)
|
||||||
jekyll-avatar (= 0.7.0)
|
jekyll-avatar (= 0.8.0)
|
||||||
jekyll-coffeescript (= 1.1.1)
|
jekyll-coffeescript (= 1.2.2)
|
||||||
jekyll-commonmark-ghpages (= 0.2.0)
|
jekyll-commonmark-ghpages (= 0.5.1)
|
||||||
jekyll-default-layout (= 0.1.4)
|
jekyll-default-layout (= 0.1.5)
|
||||||
jekyll-feed (= 0.15.1)
|
jekyll-feed (= 0.17.0)
|
||||||
jekyll-gist (= 1.5.0)
|
jekyll-gist (= 1.5.0)
|
||||||
jekyll-github-metadata (= 2.13.0)
|
jekyll-github-metadata (= 2.16.1)
|
||||||
jekyll-include-cache (= 0.2.1)
|
jekyll-include-cache (= 0.2.1)
|
||||||
jekyll-mentions (= 1.6.0)
|
jekyll-mentions (= 1.6.0)
|
||||||
jekyll-optional-front-matter (= 0.3.2)
|
jekyll-optional-front-matter (= 0.3.2)
|
||||||
|
|
@ -87,32 +82,34 @@ GEM
|
||||||
jekyll-theme-tactile (= 0.2.0)
|
jekyll-theme-tactile (= 0.2.0)
|
||||||
jekyll-theme-time-machine (= 0.2.0)
|
jekyll-theme-time-machine (= 0.2.0)
|
||||||
jekyll-titles-from-headings (= 0.5.3)
|
jekyll-titles-from-headings (= 0.5.3)
|
||||||
jemoji (= 0.12.0)
|
jemoji (= 0.13.0)
|
||||||
kramdown (= 2.3.2)
|
kramdown (= 2.4.0)
|
||||||
kramdown-parser-gfm (= 1.1.0)
|
kramdown-parser-gfm (= 1.1.0)
|
||||||
liquid (= 4.0.3)
|
liquid (= 4.0.4)
|
||||||
mercenary (~> 0.3)
|
mercenary (~> 0.3)
|
||||||
minima (= 2.5.1)
|
minima (= 2.5.1)
|
||||||
nokogiri (>= 1.13.4, < 2.0)
|
nokogiri (>= 1.16.2, < 2.0)
|
||||||
rouge (= 3.26.0)
|
rouge (= 3.30.0)
|
||||||
terminal-table (~> 1.4)
|
terminal-table (~> 1.4)
|
||||||
github-pages-health-check (1.17.9)
|
webrick (~> 1.8)
|
||||||
|
github-pages-health-check (1.18.2)
|
||||||
addressable (~> 2.3)
|
addressable (~> 2.3)
|
||||||
dnsruby (~> 1.60)
|
dnsruby (~> 1.60)
|
||||||
octokit (~> 4.0)
|
octokit (>= 4, < 8)
|
||||||
public_suffix (>= 3.0, < 5.0)
|
public_suffix (>= 3.0, < 6.0)
|
||||||
typhoeus (~> 1.3)
|
typhoeus (~> 1.3)
|
||||||
html-pipeline (2.14.1)
|
html-pipeline (2.14.3)
|
||||||
activesupport (>= 2)
|
activesupport (>= 2)
|
||||||
nokogiri (>= 1.4)
|
nokogiri (>= 1.4)
|
||||||
http_parser.rb (0.8.0)
|
http_parser.rb (0.8.0)
|
||||||
i18n (0.9.5)
|
i18n (1.14.6)
|
||||||
concurrent-ruby (~> 1.0)
|
concurrent-ruby (~> 1.0)
|
||||||
jekyll (3.9.2)
|
jekyll (3.10.0)
|
||||||
addressable (~> 2.4)
|
addressable (~> 2.4)
|
||||||
colorator (~> 1.0)
|
colorator (~> 1.0)
|
||||||
|
csv (~> 3.0)
|
||||||
em-websocket (~> 0.5)
|
em-websocket (~> 0.5)
|
||||||
i18n (~> 0.7)
|
i18n (>= 0.7, < 2)
|
||||||
jekyll-sass-converter (~> 1.0)
|
jekyll-sass-converter (~> 1.0)
|
||||||
jekyll-watch (~> 2.0)
|
jekyll-watch (~> 2.0)
|
||||||
kramdown (>= 1.17, < 3)
|
kramdown (>= 1.17, < 3)
|
||||||
|
|
@ -121,27 +118,28 @@ GEM
|
||||||
pathutil (~> 0.9)
|
pathutil (~> 0.9)
|
||||||
rouge (>= 1.7, < 4)
|
rouge (>= 1.7, < 4)
|
||||||
safe_yaml (~> 1.0)
|
safe_yaml (~> 1.0)
|
||||||
jekyll-avatar (0.7.0)
|
webrick (>= 1.0)
|
||||||
|
jekyll-avatar (0.8.0)
|
||||||
jekyll (>= 3.0, < 5.0)
|
jekyll (>= 3.0, < 5.0)
|
||||||
jekyll-coffeescript (1.1.1)
|
jekyll-coffeescript (1.2.2)
|
||||||
coffee-script (~> 2.2)
|
coffee-script (~> 2.2)
|
||||||
coffee-script-source (~> 1.11.1)
|
coffee-script-source (~> 1.12)
|
||||||
jekyll-commonmark (1.4.0)
|
jekyll-commonmark (1.4.0)
|
||||||
commonmarker (~> 0.22)
|
commonmarker (~> 0.22)
|
||||||
jekyll-commonmark-ghpages (0.2.0)
|
jekyll-commonmark-ghpages (0.5.1)
|
||||||
commonmarker (~> 0.23.4)
|
commonmarker (>= 0.23.7, < 1.1.0)
|
||||||
jekyll (~> 3.9.0)
|
jekyll (>= 3.9, < 4.0)
|
||||||
jekyll-commonmark (~> 1.4.0)
|
jekyll-commonmark (~> 1.4.0)
|
||||||
rouge (>= 2.0, < 4.0)
|
rouge (>= 2.0, < 5.0)
|
||||||
jekyll-default-layout (0.1.4)
|
jekyll-default-layout (0.1.5)
|
||||||
jekyll (~> 3.0)
|
jekyll (>= 3.0, < 5.0)
|
||||||
jekyll-feed (0.15.1)
|
jekyll-feed (0.17.0)
|
||||||
jekyll (>= 3.7, < 5.0)
|
jekyll (>= 3.7, < 5.0)
|
||||||
jekyll-gist (1.5.0)
|
jekyll-gist (1.5.0)
|
||||||
octokit (~> 4.2)
|
octokit (~> 4.2)
|
||||||
jekyll-github-metadata (2.13.0)
|
jekyll-github-metadata (2.16.1)
|
||||||
jekyll (>= 3.4, < 5.0)
|
jekyll (>= 3.4, < 5.0)
|
||||||
octokit (~> 4.0, != 4.4.0)
|
octokit (>= 4, < 7, != 4.4.0)
|
||||||
jekyll-include-cache (0.2.1)
|
jekyll-include-cache (0.2.1)
|
||||||
jekyll (>= 3.7, < 5.0)
|
jekyll (>= 3.7, < 5.0)
|
||||||
jekyll-mentions (1.6.0)
|
jekyll-mentions (1.6.0)
|
||||||
|
|
@ -212,40 +210,46 @@ GEM
|
||||||
jekyll (>= 3.3, < 5.0)
|
jekyll (>= 3.3, < 5.0)
|
||||||
jekyll-watch (2.2.1)
|
jekyll-watch (2.2.1)
|
||||||
listen (~> 3.0)
|
listen (~> 3.0)
|
||||||
jemoji (0.12.0)
|
jemoji (0.13.0)
|
||||||
gemoji (~> 3.0)
|
gemoji (>= 3, < 5)
|
||||||
html-pipeline (~> 2.2)
|
html-pipeline (~> 2.2)
|
||||||
jekyll (>= 3.0, < 5.0)
|
jekyll (>= 3.0, < 5.0)
|
||||||
kramdown (2.3.2)
|
json (2.7.2)
|
||||||
|
kramdown (2.4.0)
|
||||||
rexml
|
rexml
|
||||||
kramdown-parser-gfm (1.1.0)
|
kramdown-parser-gfm (1.1.0)
|
||||||
kramdown (~> 2.0)
|
kramdown (~> 2.0)
|
||||||
liquid (4.0.3)
|
liquid (4.0.4)
|
||||||
listen (3.7.1)
|
listen (3.9.0)
|
||||||
rb-fsevent (~> 0.10, >= 0.10.3)
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
||||||
rb-inotify (~> 0.9, >= 0.9.10)
|
rb-inotify (~> 0.9, >= 0.9.10)
|
||||||
|
logger (1.6.1)
|
||||||
mercenary (0.3.6)
|
mercenary (0.3.6)
|
||||||
|
mini_portile2 (2.8.7)
|
||||||
minima (2.5.1)
|
minima (2.5.1)
|
||||||
jekyll (>= 3.5, < 5.0)
|
jekyll (>= 3.5, < 5.0)
|
||||||
jekyll-feed (~> 0.9)
|
jekyll-feed (~> 0.9)
|
||||||
jekyll-seo-tag (~> 2.1)
|
jekyll-seo-tag (~> 2.1)
|
||||||
minitest (5.17.0)
|
minitest (5.25.1)
|
||||||
multipart-post (2.1.1)
|
net-http (0.4.1)
|
||||||
nokogiri (1.14.3-x86_64-linux)
|
uri
|
||||||
|
nokogiri (1.16.7)
|
||||||
|
mini_portile2 (~> 2.8.2)
|
||||||
racc (~> 1.4)
|
racc (~> 1.4)
|
||||||
octokit (4.22.0)
|
nokogiri (1.16.7-x86_64-linux)
|
||||||
faraday (>= 0.9)
|
racc (~> 1.4)
|
||||||
sawyer (~> 0.8.0, >= 0.5.3)
|
octokit (4.25.1)
|
||||||
|
faraday (>= 1, < 3)
|
||||||
|
sawyer (~> 0.9)
|
||||||
pathutil (0.16.2)
|
pathutil (0.16.2)
|
||||||
forwardable-extended (~> 2.6)
|
forwardable-extended (~> 2.6)
|
||||||
public_suffix (4.0.7)
|
public_suffix (5.1.1)
|
||||||
racc (1.6.2)
|
racc (1.8.1)
|
||||||
rb-fsevent (0.11.1)
|
rb-fsevent (0.11.2)
|
||||||
rb-inotify (0.10.1)
|
rb-inotify (0.11.1)
|
||||||
ffi (~> 1.0)
|
ffi (~> 1.0)
|
||||||
rexml (3.2.5)
|
rexml (3.3.9)
|
||||||
rouge (3.26.0)
|
rouge (3.30.0)
|
||||||
ruby2_keywords (0.0.5)
|
|
||||||
rubyzip (2.3.2)
|
rubyzip (2.3.2)
|
||||||
safe_yaml (1.0.5)
|
safe_yaml (1.0.5)
|
||||||
sass (3.7.4)
|
sass (3.7.4)
|
||||||
|
|
@ -253,32 +257,29 @@ GEM
|
||||||
sass-listen (4.0.0)
|
sass-listen (4.0.0)
|
||||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||||
rb-inotify (~> 0.9, >= 0.9.7)
|
rb-inotify (~> 0.9, >= 0.9.7)
|
||||||
sawyer (0.8.2)
|
sawyer (0.9.2)
|
||||||
addressable (>= 2.3.5)
|
addressable (>= 2.3.5)
|
||||||
faraday (> 0.8, < 2.0)
|
faraday (>= 0.17.3, < 3)
|
||||||
simpleidn (0.2.1)
|
securerandom (0.3.1)
|
||||||
unf (~> 0.1.4)
|
simpleidn (0.2.3)
|
||||||
terminal-table (1.8.0)
|
terminal-table (1.8.0)
|
||||||
unicode-display_width (~> 1.1, >= 1.1.1)
|
unicode-display_width (~> 1.1, >= 1.1.1)
|
||||||
thread_safe (0.3.6)
|
typhoeus (1.4.1)
|
||||||
typhoeus (1.4.0)
|
|
||||||
ethon (>= 0.9.0)
|
ethon (>= 0.9.0)
|
||||||
tzinfo (1.2.11)
|
tzinfo (2.0.6)
|
||||||
thread_safe (~> 0.1)
|
concurrent-ruby (~> 1.0)
|
||||||
unf (0.1.4)
|
|
||||||
unf_ext
|
|
||||||
unf_ext (0.0.8.1)
|
|
||||||
unicode-display_width (1.8.0)
|
unicode-display_width (1.8.0)
|
||||||
webrick (1.7.0)
|
uri (0.13.1)
|
||||||
zeitwerk (2.6.6)
|
webrick (1.8.2)
|
||||||
|
|
||||||
PLATFORMS
|
PLATFORMS
|
||||||
|
ruby
|
||||||
x86_64-linux
|
x86_64-linux
|
||||||
x86_64-linux-musl
|
x86_64-linux-musl
|
||||||
|
|
||||||
DEPENDENCIES
|
DEPENDENCIES
|
||||||
github-pages
|
github-pages (>= 232)
|
||||||
webrick (~> 1.3, >= 1.3.1)
|
webrick (~> 1.8)
|
||||||
|
|
||||||
BUNDLED WITH
|
BUNDLED WITH
|
||||||
2.3.13
|
2.3.13
|
||||||
|
|
|
||||||
22
README.md
22
README.md
|
|
@ -4,6 +4,21 @@
|
||||||
|
|
||||||
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
|
Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
|
||||||
|
|
||||||
|
<div align="center">
|
||||||
|
<hr>
|
||||||
|
<sup><b>Sponsored by:</b></sup>
|
||||||
|
<br>
|
||||||
|
<a href="https://betterstack.com">
|
||||||
|
<div>
|
||||||
|
<img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
|
||||||
|
</div>
|
||||||
|
<div>
|
||||||
|
Better Stack lets you centralize, search, and visualize your logs.
|
||||||
|
</div>
|
||||||
|
</a>
|
||||||
|
<hr>
|
||||||
|
</div>
|
||||||
|
|
||||||
## ✨ Contents
|
## ✨ Contents
|
||||||
|
|
||||||
- [Rules](#-rules)
|
- [Rules](#-rules)
|
||||||
|
|
@ -18,6 +33,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
|
|
||||||
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
|
- [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
|
||||||
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
|
- [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
|
||||||
|
- [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
|
||||||
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
|
- [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
|
||||||
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
|
- [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
|
||||||
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
|
- [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
|
||||||
|
|
@ -35,12 +51,15 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
|
- [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
|
||||||
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
|
- [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
|
||||||
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
|
- [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
|
||||||
|
- [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
|
||||||
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
|
- [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
|
||||||
|
- [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
|
||||||
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
|
- [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
|
||||||
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
|
- [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
|
||||||
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
|
- [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
|
||||||
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
|
- [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
|
||||||
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
|
- [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
|
||||||
|
- [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
|
||||||
|
|
||||||
#### Reverse proxies and load balancers
|
#### Reverse proxies and load balancers
|
||||||
|
|
||||||
|
|
@ -48,6 +67,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
|
- [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
|
||||||
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
|
- [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
|
||||||
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
|
- [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
|
||||||
|
- [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
|
||||||
|
|
||||||
#### Runtimes
|
#### Runtimes
|
||||||
|
|
||||||
|
|
@ -83,7 +103,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
||||||
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
|
- [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
|
||||||
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
|
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
|
||||||
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
|
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
|
||||||
|
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
|
||||||
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
||||||
|
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
|
||||||
|
|
||||||
## 🤝 Contributing
|
## 🤝 Contributing
|
||||||
|
|
||||||
|
|
|
||||||
1911
_data/rules.yml
1911
_data/rules.yml
File diff suppressed because it is too large
Load diff
|
|
@ -125,6 +125,18 @@
|
||||||
class="fa fa-linkedin" target="_blank"></a>
|
class="fa fa-linkedin" target="_blank"></a>
|
||||||
</li>
|
</li>
|
||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
|
<ul id="sponsoring">
|
||||||
|
<li>
|
||||||
|
Kindly supported by 👉
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
<a href="https://betterstack.com/">
|
||||||
|
<img width="" src="assets/sponsor-betterstack.png" />
|
||||||
|
</a>
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
<main id="content" class="main-content" role="main">
|
<main id="content" class="main-content" role="main">
|
||||||
|
|
@ -147,7 +159,7 @@
|
||||||
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
|
s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
|
||||||
b=c.createElement('script');b.type='text/javascript';
|
b=c.createElement('script');b.type='text/javascript';
|
||||||
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
|
b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
|
||||||
}(window,document,'$screeb','https://t.screeb.app/tag.js'));
|
}(window,document,'$screeb','https://t2.screeb.app/tag.js'));
|
||||||
|
|
||||||
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
|
$screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
|
||||||
</script>
|
</script>
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ route:
|
||||||
- receiver: "pager"
|
- receiver: "pager"
|
||||||
group_wait: 10s
|
group_wait: 10s
|
||||||
match_re:
|
match_re:
|
||||||
severity: critial
|
severity: critical
|
||||||
continue: true
|
continue: true
|
||||||
|
|
||||||
receivers:
|
receivers:
|
||||||
|
|
@ -135,4 +135,7 @@ If the notification takes too much time to be triggered, check the following del
|
||||||
- `for: 5m` (alerts/example-mysql.yml)
|
- `for: 5m` (alerts/example-mysql.yml)
|
||||||
- `group_wait = 10s` (alertmanager.yml)
|
- `group_wait = 10s` (alertmanager.yml)
|
||||||
|
|
||||||
Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
|
Also read:
|
||||||
|
- [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
|
||||||
|
- [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
|
||||||
|
- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)
|
||||||
|
|
|
||||||
|
|
@ -115,3 +115,29 @@ h2 {
|
||||||
max-width: 85rem;
|
max-width: 85rem;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ul#sponsoring {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
justify-content: center;
|
||||||
|
margin-top: 50px;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul#sponsoring li {
|
||||||
|
display: flex;
|
||||||
|
padding: 0px 15px;
|
||||||
|
font-size: 16px;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul#sponsoring li a {
|
||||||
|
display: flex;
|
||||||
|
}
|
||||||
|
|
||||||
|
ul#sponsoring li a img {
|
||||||
|
max-width: 180px;
|
||||||
|
max-height: 80px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.page-header {
|
||||||
|
padding-bottom: 30px;
|
||||||
|
}
|
||||||
BIN
assets/sponsor-betterstack.png
Normal file
BIN
assets/sponsor-betterstack.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
32
dist/rules/caddy/null.yml
vendored
Normal file
32
dist/rules/caddy/null.yml
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name:
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: CaddyReverseProxyDown
|
||||||
|
expr: 'count(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
|
||||||
|
description: "All Caddy reverse proxies are down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: CaddyHighHttp4xxErrorRateService
|
||||||
|
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
|
||||||
|
description: "Caddy service 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: CaddyHighHttp5xxErrorRateService
|
||||||
|
expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
|
||||||
|
description: "Caddy service 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
131
dist/rules/clickhouse/embedded-exporter.yml
vendored
Normal file
131
dist/rules/clickhouse/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,131 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: EmbeddedExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: ClickhouseMemoryUsageCritical
|
||||||
|
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
|
||||||
|
description: "Memory usage is critically high, over 90%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseMemoryUsageWarning
|
||||||
|
expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
|
||||||
|
description: "Memory usage is over 80%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseDiskSpaceLowOnDefault
|
||||||
|
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
|
||||||
|
description: "Disk space on default is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseDiskSpaceCriticalOnDefault
|
||||||
|
expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
|
||||||
|
description: "Disk space on default disk is critically low, below 10%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseDiskSpaceLowOnBackups
|
||||||
|
expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
|
||||||
|
description: "Disk space on backups is below 20%.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseReplicaErrors
|
||||||
|
expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
|
||||||
|
description: "Critical replica errors detected, either all replicas are stale or lost.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseNoAvailableReplicas
|
||||||
|
expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
|
||||||
|
description: "No available replicas in ClickHouse.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseNoLiveReplicas
|
||||||
|
expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
|
||||||
|
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseHighNetworkTraffic
|
||||||
|
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
|
||||||
|
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseHighTcpConnections
|
||||||
|
expr: 'ClickHouseMetrics_TCPConnection > 400'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
|
||||||
|
description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseInterserverConnectionIssues
|
||||||
|
expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
|
||||||
|
description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseZookeeperConnectionIssues
|
||||||
|
expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
|
||||||
|
description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseAuthenticationFailures
|
||||||
|
expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
|
||||||
|
description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ClickhouseAccessDeniedErrors
|
||||||
|
expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
|
||||||
|
description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
@ -23,7 +23,7 @@ groups:
|
||||||
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: ContainerHighCpuUtilization
|
- alert: ContainerHighCpuUtilization
|
||||||
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
|
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -50,8 +50,8 @@ groups:
|
||||||
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: ContainerHighThrottleRate
|
- alert: ContainerHighThrottleRate
|
||||||
expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
|
expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
|
||||||
for: 2m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
|
@ -69,7 +69,7 @@ groups:
|
||||||
|
|
||||||
|
|
||||||
- alert: ContainerLowCpuUtilization
|
- alert: ContainerLowCpuUtilization
|
||||||
expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
|
expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
|
||||||
for: 7d
|
for: 7d
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
|
||||||
|
|
@ -138,3 +138,39 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
|
summary: Elasticsearch no new documents (instance {{ $labels.instance }})
|
||||||
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ElasticsearchHighIndexingLatency
|
||||||
|
expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
|
||||||
|
description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ElasticsearchHighIndexingRate
|
||||||
|
expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
|
||||||
|
description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ElasticsearchHighQueryRate
|
||||||
|
expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
|
||||||
|
description: "The query rate on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ElasticsearchHighQueryLatency
|
||||||
|
expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
|
||||||
|
description: "The query latency on Elasticsearch cluster is higher than the threshold.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
14
dist/rules/grafana-alloy/embedded-exporter.yml
vendored
Normal file
14
dist/rules/grafana-alloy/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,14 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: EmbeddedExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: GrafanaAlloyServiceDown
|
||||||
|
# Matches instances that exposed alloy_build_info 2 minutes ago but expose nothing now.
expr: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Grafana Alloy service down (instance {{ $labels.instance }})
|
||||||
|
description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
59
dist/rules/graph-node/embedded-exporter.yml
vendored
Normal file
59
dist/rules/graph-node/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: EmbeddedExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: ProviderFailedBecauseNet_versionFailed
|
||||||
|
expr: 'eth_rpc_status == 1'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Provider failed because net_version failed (instance {{ $labels.instance }})
|
||||||
|
description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ProviderFailedBecauseGetGenesisFailed
|
||||||
|
expr: 'eth_rpc_status == 2'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Provider failed because get genesis failed (instance {{ $labels.instance }})
|
||||||
|
description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ProviderFailedBecauseNet_versionTimeout
|
||||||
|
expr: 'eth_rpc_status == 3'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Provider failed because net_version timeout (instance {{ $labels.instance }})
|
||||||
|
description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ProviderFailedBecauseGetGenesisTimeout
|
||||||
|
expr: 'eth_rpc_status == 4'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
|
||||||
|
description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: StoreConnectionIsTooSlow
|
||||||
|
expr: 'store_connection_wait_time_ms > 10'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||||
|
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: StoreConnectionIsTooSlow
|
||||||
|
expr: 'store_connection_wait_time_ms > 20'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Store connection is too slow (instance {{ $labels.instance }})
|
||||||
|
description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
95
dist/rules/hadoop/jmx_exporter.yml
vendored
Normal file
95
dist/rules/hadoop/jmx_exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,95 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: Jmx_exporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: HadoopNameNodeDown
|
||||||
|
expr: 'up{job="hadoop-namenode"} == 0'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop Name Node Down (instance {{ $labels.instance }})
|
||||||
|
description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopResourceManagerDown
|
||||||
|
expr: 'up{job="hadoop-resourcemanager"} == 0'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
|
||||||
|
description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopDataNodeOutOfService
|
||||||
|
expr: 'hadoop_datanode_last_heartbeat == 0'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
|
||||||
|
description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopHdfsDiskSpaceLow
|
||||||
|
expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
|
||||||
|
description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopMapReduceTaskFailures
|
||||||
|
expr: 'hadoop_mapreduce_task_failures_total > 100'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
|
||||||
|
description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopResourceManagerMemoryHigh
|
||||||
|
expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
|
||||||
|
description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopYarnContainerAllocationFailures
|
||||||
|
expr: 'hadoop_yarn_container_allocation_failures_total > 10'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
|
||||||
|
description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopHbaseRegionCountHigh
|
||||||
|
expr: 'hadoop_hbase_region_count > 5000'
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
|
||||||
|
description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopHbaseRegionServerHeapLow
|
||||||
|
expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
|
||||||
|
description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HadoopHbaseWriteRequestsLatencyHigh
|
||||||
|
expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
|
||||||
|
description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
2
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
2
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
|
|
@ -77,7 +77,7 @@ groups:
|
||||||
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HaproxyBackendMaxActiveSession
|
- alert: HaproxyBackendMaxActiveSession
|
||||||
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
|
expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
160
dist/rules/host-and-hardware/node-exporter.yml
vendored
160
dist/rules/host-and-hardware/node-exporter.yml
vendored
|
|
@ -5,7 +5,7 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: HostOutOfMemory
|
- alert: HostOutOfMemory
|
||||||
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -14,106 +14,97 @@ groups:
|
||||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostMemoryUnderMemoryPressure
|
- alert: HostMemoryUnderMemoryPressure
|
||||||
expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
|
||||||
for: 2m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostMemoryIsUnderutilized
|
- alert: HostMemoryIsUnderutilized
|
||||||
expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
|
||||||
for: 1w
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualNetworkThroughputIn
|
- alert: HostUnusualNetworkThroughputIn
|
||||||
expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
|
||||||
for: 5m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host receive bandwidth is high (>80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualNetworkThroughputOut
|
- alert: HostUnusualNetworkThroughputOut
|
||||||
expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
|
||||||
for: 5m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host transmit bandwidth is high (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualDiskReadRate
|
- alert: HostUnusualDiskReadRate
|
||||||
expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
|
||||||
for: 5m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualDiskWriteRate
|
|
||||||
expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
|
||||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
||||||
|
|
||||||
- alert: HostOutOfDiskSpace
|
- alert: HostOutOfDiskSpace
|
||||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostDiskWillFillIn24Hours
|
- alert: HostDiskMayFillIn24Hours
|
||||||
expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
|
||||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Filesystem will likely run out of space within the next 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostOutOfInodes
|
- alert: HostOutOfInodes
|
||||||
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostFilesystemDeviceError
|
- alert: HostFilesystemDeviceError
|
||||||
expr: 'node_filesystem_device_error == 1'
|
expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
|
||||||
for: 0m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host filesystem device error (instance {{ $labels.instance }})
|
summary: Host filesystem device error (instance {{ $labels.instance }})
|
||||||
description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostInodesWillFillIn24Hours
|
- alert: HostInodesMayFillIn24Hours
|
||||||
expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
|
||||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualDiskReadLatency
|
- alert: HostUnusualDiskReadLatency
|
||||||
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -122,7 +113,7 @@ groups:
|
||||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualDiskWriteLatency
|
- alert: HostUnusualDiskWriteLatency
|
||||||
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -131,7 +122,7 @@ groups:
|
||||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostHighCpuLoad
|
- alert: HostHighCpuLoad
|
||||||
expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
# Busy fraction per instance: 1 minus the mean idle rate across its CPUs.
expr: '(1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[2m]))) > .80'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -140,16 +131,16 @@ groups:
|
||||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuIsUnderutilized
|
- alert: HostCpuIsUnderutilized
|
||||||
expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||||
for: 1w
|
for: 1w
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuStealNoisyNeighbor
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -158,34 +149,34 @@ groups:
|
||||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostCpuHighIowait
|
- alert: HostCpuHighIowait
|
||||||
expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
summary: Host CPU high iowait (instance {{ $labels.instance }})
|
||||||
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostUnusualDiskIo
|
- alert: HostUnusualDiskIo
|
||||||
expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||||
description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostContextSwitching
|
- alert: HostContextSwitchingHigh
|
||||||
expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host context switching (instance {{ $labels.instance }})
|
summary: Host context switching high (instance {{ $labels.instance }})
|
||||||
description: "Context switching is growing on the node (> 10000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostSwapIsFillingUp
|
- alert: HostSwapIsFillingUp
|
||||||
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -194,7 +185,7 @@ groups:
|
||||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostSystemdServiceCrashed
|
- alert: HostSystemdServiceCrashed
|
||||||
expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_systemd_unit_state{state="failed"} == 1)'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -203,7 +194,7 @@ groups:
|
||||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostPhysicalComponentTooHot
|
- alert: HostPhysicalComponentTooHot
|
||||||
expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -212,7 +203,7 @@ groups:
|
||||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostNodeOvertemperatureAlarm
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
@ -220,35 +211,35 @@ groups:
|
||||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostRaidArrayGotInactive
|
- alert: HostSoftwareRaidInsufficientDrives
|
||||||
expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
|
||||||
description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostRaidDiskFailure
|
- alert: HostSoftwareRaidDiskFailure
|
||||||
expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_md_disks{state="failed"} > 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
summary: Host software RAID disk failure (instance {{ $labels.instance }})
|
||||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostKernelVersionDeviations
|
- alert: HostKernelVersionDeviations
|
||||||
expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: 'changes(node_uname_info[1h]) > 0'
|
||||||
for: 6h
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||||
description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Kernel version for {{ $labels.instance }} has changed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostOomKillDetected
|
- alert: HostOomKillDetected
|
||||||
expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -257,7 +248,7 @@ groups:
|
||||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostEdacCorrectableErrorsDetected
|
- alert: HostEdacCorrectableErrorsDetected
|
||||||
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
@ -266,7 +257,7 @@ groups:
|
||||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostEdacUncorrectableErrorsDetected
|
- alert: HostEdacUncorrectableErrorsDetected
|
||||||
expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_edac_uncorrectable_errors_total > 0)'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -275,7 +266,7 @@ groups:
|
||||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostNetworkReceiveErrors
|
- alert: HostNetworkReceiveErrors
|
||||||
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -284,7 +275,7 @@ groups:
|
||||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostNetworkTransmitErrors
|
- alert: HostNetworkTransmitErrors
|
||||||
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -292,17 +283,8 @@ groups:
|
||||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostNetworkInterfaceSaturated
|
|
||||||
expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
|
||||||
for: 1m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
|
||||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
||||||
|
|
||||||
- alert: HostNetworkBondDegraded
|
- alert: HostNetworkBondDegraded
|
||||||
expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((node_bonding_active - node_bonding_slaves) != 0)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -311,7 +293,7 @@ groups:
|
||||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostConntrackLimit
|
- alert: HostConntrackLimit
|
||||||
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -320,7 +302,7 @@ groups:
|
||||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostClockSkew
|
- alert: HostClockSkew
|
||||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -329,7 +311,7 @@ groups:
|
||||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostClockNotSynchronising
|
- alert: HostClockNotSynchronising
|
||||||
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -338,7 +320,7 @@ groups:
|
||||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: HostRequiresReboot
|
- alert: HostRequiresReboot
|
||||||
expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '(node_reboot_required > 0)'
|
||||||
for: 4h
|
for: 4h
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
|
||||||
129
dist/rules/kubernetes/kubestate-exporter.yml
vendored
129
dist/rules/kubernetes/kubestate-exporter.yml
vendored
|
|
@ -10,44 +10,44 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes node not ready (instance {{ $labels.instance }})
|
summary: Kubernetes Node not ready (node {{ $labels.node }})
|
||||||
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesMemoryPressure
|
- alert: KubernetesNodeMemoryPressure
|
||||||
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes memory pressure (instance {{ $labels.instance }})
|
summary: Kubernetes memory pressure (node {{ $labels.node }})
|
||||||
description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesDiskPressure
|
- alert: KubernetesNodeDiskPressure
|
||||||
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes disk pressure (instance {{ $labels.instance }})
|
summary: Kubernetes disk pressure (node {{ $labels.node }})
|
||||||
description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesNetworkUnavailable
|
- alert: KubernetesNodeNetworkUnavailable
|
||||||
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes network unavailable (instance {{ $labels.instance }})
|
summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
|
||||||
description: "{{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesOutOfCapacity
|
- alert: KubernetesNodeOutOfPodCapacity
|
||||||
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes out of capacity (instance {{ $labels.instance }})
|
summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
|
||||||
description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesContainerOomKiller
|
- alert: KubernetesContainerOomKiller
|
||||||
expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
|
expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
|
||||||
|
|
@ -55,7 +55,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes container oom killer (instance {{ $labels.instance }})
|
summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
|
||||||
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesJobFailed
|
- alert: KubernetesJobFailed
|
||||||
|
|
@ -64,16 +64,25 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes Job failed (instance {{ $labels.instance }})
|
summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||||
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: KubernetesJobNotStarting
|
||||||
|
expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||||
|
description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesCronjobSuspended
|
- alert: KubernetesCronjobSuspended
|
||||||
expr: 'kube_cronjob_spec_suspend != 0'
|
expr: 'kube_cronjob_spec_suspend != 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
|
summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesPersistentvolumeclaimPending
|
- alert: KubernetesPersistentvolumeclaimPending
|
||||||
|
|
@ -82,7 +91,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
|
summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
|
||||||
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesVolumeOutOfDiskSpace
|
- alert: KubernetesVolumeOutOfDiskSpace
|
||||||
|
|
@ -95,13 +104,13 @@ groups:
|
||||||
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesVolumeFullInFourDays
|
- alert: KubernetesVolumeFullInFourDays
|
||||||
expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
|
expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
|
summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
|
||||||
description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesPersistentvolumeError
|
- alert: KubernetesPersistentvolumeError
|
||||||
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
|
||||||
|
|
@ -109,8 +118,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
|
summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
|
||||||
description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesStatefulsetDown
|
- alert: KubernetesStatefulsetDown
|
||||||
expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
|
expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
|
||||||
|
|
@ -118,35 +127,35 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
|
summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||||
description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesHpaScalingAbility
|
- alert: KubernetesHpaScaleInability
|
||||||
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
|
expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
|
summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
|
||||||
description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesHpaMetricAvailability
|
- alert: KubernetesHpaMetricsUnavailability
|
||||||
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
|
expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
|
summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
|
||||||
description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesHpaScaleCapability
|
- alert: KubernetesHpaScaleMaximum
|
||||||
expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
|
expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
|
summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
|
||||||
description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesHpaUnderutilized
|
- alert: KubernetesHpaUnderutilized
|
||||||
expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
|
expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
|
||||||
|
|
@ -155,7 +164,7 @@ groups:
|
||||||
severity: info
|
severity: info
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
|
summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
|
||||||
description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesPodNotHealthy
|
- alert: KubernetesPodNotHealthy
|
||||||
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
|
||||||
|
|
@ -163,8 +172,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
|
summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||||
description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesPodCrashLooping
|
- alert: KubernetesPodCrashLooping
|
||||||
expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
|
expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
|
||||||
|
|
@ -172,17 +181,17 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
|
summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
|
||||||
description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesReplicassetMismatch
|
- alert: KubernetesReplicasetReplicasMismatch
|
||||||
expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
|
expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
|
summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
|
||||||
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesDeploymentReplicasMismatch
|
- alert: KubernetesDeploymentReplicasMismatch
|
||||||
expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
|
expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
|
||||||
|
|
@ -190,8 +199,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
|
summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||||
description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesStatefulsetReplicasMismatch
|
- alert: KubernetesStatefulsetReplicasMismatch
|
||||||
expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
|
expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
|
||||||
|
|
@ -200,7 +209,7 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
|
summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
|
||||||
description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesDeploymentGenerationMismatch
|
- alert: KubernetesDeploymentGenerationMismatch
|
||||||
expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
|
expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
|
||||||
|
|
@ -208,8 +217,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
|
summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
|
||||||
description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesStatefulsetGenerationMismatch
|
- alert: KubernetesStatefulsetGenerationMismatch
|
||||||
expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
|
expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
|
||||||
|
|
@ -217,8 +226,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
|
summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||||
description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
- alert: KubernetesStatefulsetUpdateNotRolledOut
|
||||||
expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
|
expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
|
||||||
|
|
@ -226,8 +235,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
|
summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
|
||||||
description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesDaemonsetRolloutStuck
|
- alert: KubernetesDaemonsetRolloutStuck
|
||||||
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
|
expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
|
||||||
|
|
@ -235,8 +244,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
|
summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||||
description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesDaemonsetMisscheduled
|
- alert: KubernetesDaemonsetMisscheduled
|
||||||
expr: 'kube_daemonset_status_number_misscheduled > 0'
|
expr: 'kube_daemonset_status_number_misscheduled > 0'
|
||||||
|
|
@ -244,8 +253,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
|
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||||
description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesCronjobTooLong
|
- alert: KubernetesCronjobTooLong
|
||||||
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
|
expr: 'time() - kube_cronjob_next_schedule_time > 3600'
|
||||||
|
|
@ -253,7 +262,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
|
summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
|
||||||
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesJobSlowCompletion
|
- alert: KubernetesJobSlowCompletion
|
||||||
|
|
@ -262,11 +271,11 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Kubernetes job slow completion (instance {{ $labels.instance }})
|
summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
|
||||||
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesApiServerErrors
|
- alert: KubernetesApiServerErrors
|
||||||
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
|
expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
@ -302,7 +311,7 @@ groups:
|
||||||
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: KubernetesApiServerLatency
|
- alert: KubernetesApiServerLatency
|
||||||
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
|
expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
23
dist/rules/meilisearch/embedded-exporter.yml
vendored
Normal file
23
dist/rules/meilisearch/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: EmbeddedExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: MeilisearchIndexIsEmpty
|
||||||
|
expr: 'meilisearch_index_docs_count == 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Meilisearch index is empty (instance {{ $labels.instance }})
|
||||||
|
description: "Meilisearch index is empty (no documents)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MeilisearchHttpResponseTime
|
||||||
|
expr: 'meilisearch_http_response_time_seconds > 0.5'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Meilisearch http response time (instance {{ $labels.instance }})
|
||||||
|
description: "Meilisearch http response time is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
2
dist/rules/minio/embedded-exporter.yml
vendored
2
dist/rules/minio/embedded-exporter.yml
vendored
|
|
@ -5,7 +5,7 @@ groups:
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
- alert: MinioClusterDiskOffline
|
- alert: MinioClusterDiskOffline
|
||||||
expr: 'minio_cluster_disk_offline_total > 0'
|
expr: 'minio_cluster_drive_offline_total > 0'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
|
|
@ -66,12 +66,3 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: MongoDB too many connections (instance {{ $labels.instance }})
|
summary: MongoDB too many connections (instance {{ $labels.instance }})
|
||||||
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: MongodbVirtualMemoryUsage
|
|
||||||
expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
|
|
||||||
for: 2m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
|
|
||||||
description: "High memory usage\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
||||||
|
|
|
||||||
45
dist/rules/mysql/mysqld-exporter.yml
vendored
45
dist/rules/mysql/mysqld-exporter.yml
vendored
|
|
@ -22,6 +22,15 @@ groups:
|
||||||
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
|
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
|
||||||
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MysqlHighPreparedStatementsUtilization(>80%)
|
||||||
|
expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
|
||||||
|
description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: MysqlHighThreadsRunning
|
- alert: MysqlHighThreadsRunning
|
||||||
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
|
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
|
||||||
for: 2m
|
for: 2m
|
||||||
|
|
@ -84,3 +93,39 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: MySQL restarted (instance {{ $labels.instance }})
|
summary: MySQL restarted (instance {{ $labels.instance }})
|
||||||
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MysqlHighQps
|
||||||
|
expr: 'irate(mysql_global_status_questions[1m]) > 10000'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: MySQL High QPS (instance {{ $labels.instance }})
|
||||||
|
description: "MySQL is being overloaded with unusual QPS (> 10k QPS).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MysqlTooManyOpenFiles
|
||||||
|
expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: MySQL too many open files (instance {{ $labels.instance }})
|
||||||
|
description: "MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MysqlInnodbForceRecoveryIsEnabled
|
||||||
|
expr: 'mysql_global_variables_innodb_force_recovery != 0'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
|
||||||
|
description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: MysqlInnodbHistory_lenTooLong
|
||||||
|
expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
|
||||||
|
description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
146
dist/rules/nats/nats-exporter.yml
vendored
146
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -32,10 +32,154 @@ groups:
|
||||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: NatsHighRoutesCount
|
- alert: NatsHighRoutesCount
|
||||||
expr: 'gnatsd_routez_num_routes > 10'
|
expr: 'gnatsd_varz_routes > 10'
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighMemoryUsage
|
||||||
|
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high memory usage (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsSlowConsumers
|
||||||
|
expr: 'gnatsd_varz_slow_consumers > 0'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Nats slow consumers (instance {{ $labels.instance }})
|
||||||
|
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsServerDown
|
||||||
|
expr: 'absent(up{job="nats"})'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Nats server down (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighCpuUsage
|
||||||
|
expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high CPU usage (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighNumberOfConnections
|
||||||
|
expr: 'gnatsd_connz_num_connections > 1000'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high number of connections (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighJetstreamStoreUsage
|
||||||
|
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
|
||||||
|
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighJetstreamMemoryUsage
|
||||||
|
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
|
||||||
|
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighNumberOfSubscriptions
|
||||||
|
expr: 'gnatsd_connz_subscriptions > 1000'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighPendingBytes
|
||||||
|
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsTooManyErrors
|
||||||
|
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats too many errors (instance {{ $labels.instance }})
|
||||||
|
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsJetstreamConsumersExceeded
|
||||||
|
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
|
||||||
|
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsFrequentAuthenticationTimeouts
|
||||||
|
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
|
||||||
|
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsMaxPayloadSizeExceeded
|
||||||
|
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
|
||||||
|
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsLeafNodeConnectionIssue
|
||||||
|
expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
|
||||||
|
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsMaxPingOperationsExceeded
|
||||||
|
expr: 'gnatsd_varz_ping_max > 50'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
|
||||||
|
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsWriteDeadlineExceeded
|
||||||
|
expr: 'gnatsd_varz_write_deadline > 10'
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
|
||||||
|
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
2
dist/rules/netdata/embedded-exporter.yml
vendored
2
dist/rules/netdata/embedded-exporter.yml
vendored
|
|
@ -23,7 +23,7 @@ groups:
|
||||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: NetdataHighMemoryUsage
|
- alert: NetdataHighMemoryUsage
|
||||||
expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
|
expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
23
dist/rules/postgresql/postgres-exporter.yml
vendored
23
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -32,7 +32,7 @@ groups:
|
||||||
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlTableNotAutoVacuumed
|
- alert: PostgresqlTableNotAutoVacuumed
|
||||||
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -41,7 +41,7 @@ groups:
|
||||||
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlTableNotAutoAnalyzed
|
- alert: PostgresqlTableNotAutoAnalyzed
|
||||||
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -50,7 +50,7 @@ groups:
|
||||||
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlTooManyConnections
|
- alert: PostgresqlTooManyConnections
|
||||||
expr: ''
|
expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
@ -62,7 +62,7 @@ groups:
|
||||||
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
summary: Postgresql not enough connections (instance {{ $labels.instance }})
|
||||||
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
@ -86,7 +86,7 @@ groups:
|
||||||
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlCommitRateLow
|
- alert: PostgresqlCommitRateLow
|
||||||
expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
|
expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
@ -140,7 +140,7 @@ groups:
|
||||||
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlConfigurationChanged
|
- alert: PostgresqlConfigurationChanged
|
||||||
expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
|
expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: info
|
severity: info
|
||||||
|
|
@ -155,7 +155,7 @@ groups:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
|
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
|
||||||
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PostgresqlTooManyLocksAcquired
|
- alert: PostgresqlTooManyLocksAcquired
|
||||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||||
|
|
@ -183,3 +183,12 @@ groups:
|
||||||
annotations:
|
annotations:
|
||||||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
||||||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PostgresqlInvalidIndex
|
||||||
|
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||||
|
for: 6h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Postgresql invalid index (instance {{ $labels.instance }})
|
||||||
|
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ groups:
|
||||||
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: PrometheusTargetMissingWithWarmupTime
|
- alert: PrometheusTargetMissingWithWarmupTime
|
||||||
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
|
expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
|
||||||
for: 0m
|
for: 0m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
|
||||||
22
dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
vendored
22
dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
vendored
|
|
@ -10,7 +10,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq down (instance {{ $labels.instance }})
|
summary: RabbitMQ down (instance {{ $labels.instance }})
|
||||||
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqClusterDown
|
- alert: RabbitmqClusterDown
|
||||||
|
|
@ -19,7 +19,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq cluster down (instance {{ $labels.instance }})
|
summary: RabbitMQ cluster down (instance {{ $labels.instance }})
|
||||||
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqClusterPartition
|
- alert: RabbitmqClusterPartition
|
||||||
|
|
@ -28,7 +28,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq cluster partition (instance {{ $labels.instance }})
|
summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
|
||||||
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Cluster partition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqOutOfMemory
|
- alert: RabbitmqOutOfMemory
|
||||||
|
|
@ -37,7 +37,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq out of memory (instance {{ $labels.instance }})
|
summary: RabbitMQ out of memory (instance {{ $labels.instance }})
|
||||||
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqTooManyConnections
|
- alert: RabbitmqTooManyConnections
|
||||||
|
|
@ -46,7 +46,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq too many connections (instance {{ $labels.instance }})
|
summary: RabbitMQ too many connections (instance {{ $labels.instance }})
|
||||||
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqDeadLetterQueueFillingUp
|
- alert: RabbitmqDeadLetterQueueFillingUp
|
||||||
|
|
@ -55,7 +55,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})
|
summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
|
||||||
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqTooManyMessagesInQueue
|
- alert: RabbitmqTooManyMessagesInQueue
|
||||||
|
|
@ -64,7 +64,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq too many messages in queue (instance {{ $labels.instance }})
|
summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
|
||||||
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqSlowQueueConsuming
|
- alert: RabbitmqSlowQueueConsuming
|
||||||
|
|
@ -73,7 +73,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq slow queue consuming (instance {{ $labels.instance }})
|
summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
|
||||||
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Queue messages are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqNoConsumer
|
- alert: RabbitmqNoConsumer
|
||||||
|
|
@ -82,7 +82,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq no consumer (instance {{ $labels.instance }})
|
summary: RabbitMQ no consumer (instance {{ $labels.instance }})
|
||||||
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqTooManyConsumers
|
- alert: RabbitmqTooManyConsumers
|
||||||
|
|
@ -91,7 +91,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq too many consumers (instance {{ $labels.instance }})
|
summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
|
||||||
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqUnactiveExchange
|
- alert: RabbitmqUnactiveExchange
|
||||||
|
|
@ -100,5 +100,5 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq unactive exchange (instance {{ $labels.instance }})
|
summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
|
||||||
description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Exchange receive less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
29
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
29
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
|
|
@ -10,7 +10,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq node down (instance {{ $labels.instance }})
|
summary: RabbitMQ node down (instance {{ $labels.instance }})
|
||||||
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqNodeNotDistributed
|
- alert: RabbitmqNodeNotDistributed
|
||||||
|
|
@ -19,7 +19,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq node not distributed (instance {{ $labels.instance }})
|
summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
|
||||||
description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqInstancesDifferentVersions
|
- alert: RabbitmqInstancesDifferentVersions
|
||||||
|
|
@ -28,8 +28,8 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq instances different versions (instance {{ $labels.instance }})
|
summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
|
||||||
description: "Running different version of Rabbitmq in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Running different versions of RabbitMQ in the same cluster can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqMemoryHigh
|
- alert: RabbitmqMemoryHigh
|
||||||
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
|
expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
|
||||||
|
|
@ -37,7 +37,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq memory high (instance {{ $labels.instance }})
|
summary: RabbitMQ memory high (instance {{ $labels.instance }})
|
||||||
description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqFileDescriptorsUsage
|
- alert: RabbitmqFileDescriptorsUsage
|
||||||
|
|
@ -46,16 +46,25 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }})
|
summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
|
||||||
description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: RabbitmqTooManyReadyMessages
|
||||||
|
expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
|
||||||
|
description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqTooManyUnackMessages
|
- alert: RabbitmqTooManyUnackMessages
|
||||||
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
|
expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq too many unack messages (instance {{ $labels.instance }})
|
summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
|
||||||
description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqTooManyConnections
|
- alert: RabbitmqTooManyConnections
|
||||||
|
|
@ -64,7 +73,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq too many connections (instance {{ $labels.instance }})
|
summary: RabbitMQ too many connections (instance {{ $labels.instance }})
|
||||||
description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqNoQueueConsumer
|
- alert: RabbitmqNoQueueConsumer
|
||||||
|
|
@ -73,7 +82,7 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq no queue consumer (instance {{ $labels.instance }})
|
summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
|
||||||
description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RabbitmqUnroutableMessages
|
- alert: RabbitmqUnroutableMessages
|
||||||
|
|
@ -82,5 +91,5 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Rabbitmq unroutable messages (instance {{ $labels.instance }})
|
summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
|
||||||
description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ groups:
|
||||||
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Redis is running out of system memory (> 90%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: RedisOutOfConfiguredMaxmemory
|
- alert: RedisOutOfConfiguredMaxmemory
|
||||||
expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
|
expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
|
||||||
77
dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
vendored
Normal file
77
dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: SmartctlExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: SmartDeviceTemperatureWarning
|
||||||
|
expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: SMART device temperature warning (instance {{ $labels.instance }})
|
||||||
|
description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartDeviceTemperatureCritical
|
||||||
|
expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART device temperature critical (instance {{ $labels.instance }})
|
||||||
|
description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartDeviceTemperatureOverTripValue
|
||||||
|
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART device temperature over trip value (instance {{ $labels.instance }})
|
||||||
|
description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartDeviceTemperatureNearingTripValue
|
||||||
|
expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
|
||||||
|
description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartStatus
|
||||||
|
expr: 'smartctl_device_smart_status != 1'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART status (instance {{ $labels.instance }})
|
||||||
|
description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartCriticalWarning
|
||||||
|
expr: 'smartctl_device_critical_warning > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART critical warning (instance {{ $labels.instance }})
|
||||||
|
description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartMediaErrors
|
||||||
|
expr: 'smartctl_device_media_errors > 0'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART media errors (instance {{ $labels.instance }})
|
||||||
|
description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: SmartWearoutIndicator
|
||||||
|
expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: SMART Wearout Indicator (instance {{ $labels.instance }})
|
||||||
|
description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
2
dist/template.yml
vendored
2
dist/template.yml
vendored
|
|
@ -11,6 +11,6 @@ groups:
|
||||||
labels:
|
labels:
|
||||||
severity: {{ rule.severity }}
|
severity: {{ rule.severity }}
|
||||||
annotations:
|
annotations:
|
||||||
summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
|
summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
|
||||||
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
|
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
Loading…
Reference in a new issue