mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 03:17:07 +08:00
Merge branch 'master' into master
This commit is contained in:
commit
860055d870
6 changed files with 341 additions and 116 deletions
4
Gemfile
4
Gemfile
|
|
@ -1,3 +1,3 @@
|
|||
source 'https://rubygems.org'
|
||||
gem 'github-pages', group: :jekyll_plugins
|
||||
gem 'webrick', '~> 1.3', '>= 1.3.1'
|
||||
gem 'github-pages', '>= 232', group: :jekyll_plugins
|
||||
gem 'webrick', '~> 1.8'
|
||||
213
Gemfile.lock
213
Gemfile.lock
|
|
@ -1,66 +1,61 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
activesupport (6.0.6.1)
|
||||
concurrent-ruby (~> 1.0, >= 1.0.2)
|
||||
i18n (>= 0.7, < 2)
|
||||
minitest (~> 5.1)
|
||||
tzinfo (~> 1.1)
|
||||
zeitwerk (~> 2.2, >= 2.2.2)
|
||||
addressable (2.8.0)
|
||||
public_suffix (>= 2.0.2, < 5.0)
|
||||
activesupport (7.2.1)
|
||||
base64
|
||||
bigdecimal
|
||||
concurrent-ruby (~> 1.0, >= 1.3.1)
|
||||
connection_pool (>= 2.2.5)
|
||||
drb
|
||||
i18n (>= 1.6, < 2)
|
||||
logger (>= 1.4.2)
|
||||
minitest (>= 5.1)
|
||||
securerandom (>= 0.3)
|
||||
tzinfo (~> 2.0, >= 2.0.5)
|
||||
addressable (2.8.7)
|
||||
public_suffix (>= 2.0.2, < 7.0)
|
||||
base64 (0.2.0)
|
||||
bigdecimal (3.1.8)
|
||||
coffee-script (2.4.1)
|
||||
coffee-script-source
|
||||
execjs
|
||||
coffee-script-source (1.11.1)
|
||||
coffee-script-source (1.12.2)
|
||||
colorator (1.1.0)
|
||||
commonmarker (0.23.10)
|
||||
concurrent-ruby (1.2.0)
|
||||
dnsruby (1.61.9)
|
||||
simpleidn (~> 0.1)
|
||||
concurrent-ruby (1.3.4)
|
||||
connection_pool (2.4.1)
|
||||
csv (3.3.0)
|
||||
dnsruby (1.72.2)
|
||||
simpleidn (~> 0.2.1)
|
||||
drb (2.2.1)
|
||||
em-websocket (0.5.3)
|
||||
eventmachine (>= 0.12.9)
|
||||
http_parser.rb (~> 0)
|
||||
ethon (0.15.0)
|
||||
ethon (0.16.0)
|
||||
ffi (>= 1.15.0)
|
||||
eventmachine (1.2.7)
|
||||
execjs (2.8.1)
|
||||
faraday (1.10.0)
|
||||
faraday-em_http (~> 1.0)
|
||||
faraday-em_synchrony (~> 1.0)
|
||||
faraday-excon (~> 1.1)
|
||||
faraday-httpclient (~> 1.0)
|
||||
faraday-multipart (~> 1.0)
|
||||
faraday-net_http (~> 1.0)
|
||||
faraday-net_http_persistent (~> 1.0)
|
||||
faraday-patron (~> 1.0)
|
||||
faraday-rack (~> 1.0)
|
||||
faraday-retry (~> 1.0)
|
||||
ruby2_keywords (>= 0.0.4)
|
||||
faraday-em_http (1.0.0)
|
||||
faraday-em_synchrony (1.0.0)
|
||||
faraday-excon (1.1.0)
|
||||
faraday-httpclient (1.0.1)
|
||||
faraday-multipart (1.0.3)
|
||||
multipart-post (>= 1.2, < 3)
|
||||
faraday-net_http (1.0.1)
|
||||
faraday-net_http_persistent (1.2.0)
|
||||
faraday-patron (1.0.0)
|
||||
faraday-rack (1.0.0)
|
||||
faraday-retry (1.0.3)
|
||||
ffi (1.15.5)
|
||||
execjs (2.9.1)
|
||||
faraday (2.12.0)
|
||||
faraday-net_http (>= 2.0, < 3.4)
|
||||
json
|
||||
logger
|
||||
faraday-net_http (3.3.0)
|
||||
net-http
|
||||
ffi (1.17.0)
|
||||
ffi (1.17.0-x86_64-linux-gnu)
|
||||
ffi (1.17.0-x86_64-linux-musl)
|
||||
forwardable-extended (2.6.0)
|
||||
gemoji (3.0.1)
|
||||
github-pages (226)
|
||||
github-pages-health-check (= 1.17.9)
|
||||
jekyll (= 3.9.2)
|
||||
jekyll-avatar (= 0.7.0)
|
||||
jekyll-coffeescript (= 1.1.1)
|
||||
jekyll-commonmark-ghpages (= 0.2.0)
|
||||
jekyll-default-layout (= 0.1.4)
|
||||
jekyll-feed (= 0.15.1)
|
||||
gemoji (4.1.0)
|
||||
github-pages (232)
|
||||
github-pages-health-check (= 1.18.2)
|
||||
jekyll (= 3.10.0)
|
||||
jekyll-avatar (= 0.8.0)
|
||||
jekyll-coffeescript (= 1.2.2)
|
||||
jekyll-commonmark-ghpages (= 0.5.1)
|
||||
jekyll-default-layout (= 0.1.5)
|
||||
jekyll-feed (= 0.17.0)
|
||||
jekyll-gist (= 1.5.0)
|
||||
jekyll-github-metadata (= 2.13.0)
|
||||
jekyll-github-metadata (= 2.16.1)
|
||||
jekyll-include-cache (= 0.2.1)
|
||||
jekyll-mentions (= 1.6.0)
|
||||
jekyll-optional-front-matter (= 0.3.2)
|
||||
|
|
@ -87,32 +82,34 @@ GEM
|
|||
jekyll-theme-tactile (= 0.2.0)
|
||||
jekyll-theme-time-machine (= 0.2.0)
|
||||
jekyll-titles-from-headings (= 0.5.3)
|
||||
jemoji (= 0.12.0)
|
||||
kramdown (= 2.3.2)
|
||||
jemoji (= 0.13.0)
|
||||
kramdown (= 2.4.0)
|
||||
kramdown-parser-gfm (= 1.1.0)
|
||||
liquid (= 4.0.3)
|
||||
liquid (= 4.0.4)
|
||||
mercenary (~> 0.3)
|
||||
minima (= 2.5.1)
|
||||
nokogiri (>= 1.13.4, < 2.0)
|
||||
rouge (= 3.26.0)
|
||||
nokogiri (>= 1.16.2, < 2.0)
|
||||
rouge (= 3.30.0)
|
||||
terminal-table (~> 1.4)
|
||||
github-pages-health-check (1.17.9)
|
||||
webrick (~> 1.8)
|
||||
github-pages-health-check (1.18.2)
|
||||
addressable (~> 2.3)
|
||||
dnsruby (~> 1.60)
|
||||
octokit (~> 4.0)
|
||||
public_suffix (>= 3.0, < 5.0)
|
||||
octokit (>= 4, < 8)
|
||||
public_suffix (>= 3.0, < 6.0)
|
||||
typhoeus (~> 1.3)
|
||||
html-pipeline (2.14.1)
|
||||
html-pipeline (2.14.3)
|
||||
activesupport (>= 2)
|
||||
nokogiri (>= 1.4)
|
||||
http_parser.rb (0.8.0)
|
||||
i18n (0.9.5)
|
||||
i18n (1.14.6)
|
||||
concurrent-ruby (~> 1.0)
|
||||
jekyll (3.9.2)
|
||||
jekyll (3.10.0)
|
||||
addressable (~> 2.4)
|
||||
colorator (~> 1.0)
|
||||
csv (~> 3.0)
|
||||
em-websocket (~> 0.5)
|
||||
i18n (~> 0.7)
|
||||
i18n (>= 0.7, < 2)
|
||||
jekyll-sass-converter (~> 1.0)
|
||||
jekyll-watch (~> 2.0)
|
||||
kramdown (>= 1.17, < 3)
|
||||
|
|
@ -121,27 +118,28 @@ GEM
|
|||
pathutil (~> 0.9)
|
||||
rouge (>= 1.7, < 4)
|
||||
safe_yaml (~> 1.0)
|
||||
jekyll-avatar (0.7.0)
|
||||
webrick (>= 1.0)
|
||||
jekyll-avatar (0.8.0)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
jekyll-coffeescript (1.1.1)
|
||||
jekyll-coffeescript (1.2.2)
|
||||
coffee-script (~> 2.2)
|
||||
coffee-script-source (~> 1.11.1)
|
||||
coffee-script-source (~> 1.12)
|
||||
jekyll-commonmark (1.4.0)
|
||||
commonmarker (~> 0.22)
|
||||
jekyll-commonmark-ghpages (0.2.0)
|
||||
commonmarker (~> 0.23.4)
|
||||
jekyll (~> 3.9.0)
|
||||
jekyll-commonmark-ghpages (0.5.1)
|
||||
commonmarker (>= 0.23.7, < 1.1.0)
|
||||
jekyll (>= 3.9, < 4.0)
|
||||
jekyll-commonmark (~> 1.4.0)
|
||||
rouge (>= 2.0, < 4.0)
|
||||
jekyll-default-layout (0.1.4)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-feed (0.15.1)
|
||||
rouge (>= 2.0, < 5.0)
|
||||
jekyll-default-layout (0.1.5)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
jekyll-feed (0.17.0)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-gist (1.5.0)
|
||||
octokit (~> 4.2)
|
||||
jekyll-github-metadata (2.13.0)
|
||||
jekyll-github-metadata (2.16.1)
|
||||
jekyll (>= 3.4, < 5.0)
|
||||
octokit (~> 4.0, != 4.4.0)
|
||||
octokit (>= 4, < 7, != 4.4.0)
|
||||
jekyll-include-cache (0.2.1)
|
||||
jekyll (>= 3.7, < 5.0)
|
||||
jekyll-mentions (1.6.0)
|
||||
|
|
@ -212,41 +210,46 @@ GEM
|
|||
jekyll (>= 3.3, < 5.0)
|
||||
jekyll-watch (2.2.1)
|
||||
listen (~> 3.0)
|
||||
jemoji (0.12.0)
|
||||
gemoji (~> 3.0)
|
||||
jemoji (0.13.0)
|
||||
gemoji (>= 3, < 5)
|
||||
html-pipeline (~> 2.2)
|
||||
jekyll (>= 3.0, < 5.0)
|
||||
kramdown (2.3.2)
|
||||
json (2.7.2)
|
||||
kramdown (2.4.0)
|
||||
rexml
|
||||
kramdown-parser-gfm (1.1.0)
|
||||
kramdown (~> 2.0)
|
||||
liquid (4.0.3)
|
||||
listen (3.7.1)
|
||||
liquid (4.0.4)
|
||||
listen (3.9.0)
|
||||
rb-fsevent (~> 0.10, >= 0.10.3)
|
||||
rb-inotify (~> 0.9, >= 0.9.10)
|
||||
logger (1.6.1)
|
||||
mercenary (0.3.6)
|
||||
mini_portile2 (2.8.7)
|
||||
minima (2.5.1)
|
||||
jekyll (>= 3.5, < 5.0)
|
||||
jekyll-feed (~> 0.9)
|
||||
jekyll-seo-tag (~> 2.1)
|
||||
minitest (5.17.0)
|
||||
multipart-post (2.1.1)
|
||||
nokogiri (1.16.5-x86_64-linux)
|
||||
minitest (5.25.1)
|
||||
net-http (0.4.1)
|
||||
uri
|
||||
nokogiri (1.16.7)
|
||||
mini_portile2 (~> 2.8.2)
|
||||
racc (~> 1.4)
|
||||
octokit (4.22.0)
|
||||
faraday (>= 0.9)
|
||||
sawyer (~> 0.8.0, >= 0.5.3)
|
||||
nokogiri (1.16.7-x86_64-linux)
|
||||
racc (~> 1.4)
|
||||
octokit (4.25.1)
|
||||
faraday (>= 1, < 3)
|
||||
sawyer (~> 0.9)
|
||||
pathutil (0.16.2)
|
||||
forwardable-extended (~> 2.6)
|
||||
public_suffix (4.0.7)
|
||||
racc (1.7.3)
|
||||
rb-fsevent (0.11.1)
|
||||
rb-inotify (0.10.1)
|
||||
public_suffix (5.1.1)
|
||||
racc (1.8.1)
|
||||
rb-fsevent (0.11.2)
|
||||
rb-inotify (0.11.1)
|
||||
ffi (~> 1.0)
|
||||
rexml (3.2.8)
|
||||
strscan (>= 3.0.9)
|
||||
rouge (3.26.0)
|
||||
ruby2_keywords (0.0.5)
|
||||
rexml (3.3.7)
|
||||
rouge (3.30.0)
|
||||
rubyzip (2.3.2)
|
||||
safe_yaml (1.0.5)
|
||||
sass (3.7.4)
|
||||
|
|
@ -254,33 +257,29 @@ GEM
|
|||
sass-listen (4.0.0)
|
||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||
rb-inotify (~> 0.9, >= 0.9.7)
|
||||
sawyer (0.8.2)
|
||||
sawyer (0.9.2)
|
||||
addressable (>= 2.3.5)
|
||||
faraday (> 0.8, < 2.0)
|
||||
simpleidn (0.2.1)
|
||||
unf (~> 0.1.4)
|
||||
strscan (3.1.0)
|
||||
faraday (>= 0.17.3, < 3)
|
||||
securerandom (0.3.1)
|
||||
simpleidn (0.2.3)
|
||||
terminal-table (1.8.0)
|
||||
unicode-display_width (~> 1.1, >= 1.1.1)
|
||||
thread_safe (0.3.6)
|
||||
typhoeus (1.4.0)
|
||||
typhoeus (1.4.1)
|
||||
ethon (>= 0.9.0)
|
||||
tzinfo (1.2.11)
|
||||
thread_safe (~> 0.1)
|
||||
unf (0.1.4)
|
||||
unf_ext
|
||||
unf_ext (0.0.8.1)
|
||||
tzinfo (2.0.6)
|
||||
concurrent-ruby (~> 1.0)
|
||||
unicode-display_width (1.8.0)
|
||||
webrick (1.7.0)
|
||||
zeitwerk (2.6.6)
|
||||
uri (0.13.1)
|
||||
webrick (1.8.2)
|
||||
|
||||
PLATFORMS
|
||||
ruby
|
||||
x86_64-linux
|
||||
x86_64-linux-musl
|
||||
|
||||
DEPENDENCIES
|
||||
github-pages
|
||||
webrick (~> 1.3, >= 1.3.1)
|
||||
github-pages (>= 232)
|
||||
webrick (~> 1.8)
|
||||
|
||||
BUNDLED WITH
|
||||
2.3.13
|
||||
|
|
|
|||
|
|
@ -267,8 +267,7 @@ groups:
|
|||
for: 5m
|
||||
- name: Host node overtemperature alarm
|
||||
description: "Physical node temperature alarm triggered"
|
||||
# This is a critical alarm, some things (eg. NVMe) have just the temp alarm.
|
||||
query: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
|
||||
query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
severity: critical
|
||||
- name: Host Software RAID insufficient drives
|
||||
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
|
||||
|
|
@ -744,9 +743,11 @@ groups:
|
|||
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- name: Postgresql invalid index
|
||||
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
|
||||
query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
severity: warning
|
||||
for: 6h
|
||||
comments: |
|
||||
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
|
||||
- name: SQL Server
|
||||
exporters:
|
||||
|
|
@ -1546,9 +1547,90 @@ groups:
|
|||
for: 3m
|
||||
- name: Nats high routes count
|
||||
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
|
||||
query: "gnatsd_routez_num_routes > 10"
|
||||
query: "gnatsd_varz_routes > 10"
|
||||
severity: warning
|
||||
for: 3m
|
||||
- name: Nats high memory usage
|
||||
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
||||
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats slow consumers
|
||||
description: There are slow consumers in NATS for {{ $labels.instance }}
|
||||
query: "gnatsd_varz_slow_consumers > 0"
|
||||
severity: critical
|
||||
for: 3m
|
||||
- name: Nats server down
|
||||
description: NATS server has been down for more than 5 minutes
|
||||
query: 'absent(up{job="nats"})'
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: Nats high CPU usage
|
||||
description: NATS server is using more than 80% CPU for the last 5 minutes
|
||||
query: "rate(gnatsd_varz_cpu[5m]) > 0.8"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats high number of connections
|
||||
description: NATS server has more than 1000 active connections
|
||||
query: "gnatsd_connz_num_connections > 1000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats high JetStream store usage
|
||||
description: JetStream store usage is over 80%
|
||||
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats high JetStream memory usage
|
||||
description: JetStream memory usage is over 80%
|
||||
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats high number of subscriptions
|
||||
description: NATS server has more than 1000 active subscriptions
|
||||
query: "gnatsd_connz_subscriptions > 1000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats high pending bytes
|
||||
description: NATS server has more than 100,000 pending bytes
|
||||
query: "gnatsd_connz_pending_bytes > 100000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats too many errors
|
||||
description: NATS server has encountered errors in the last 5 minutes
|
||||
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats JetStream consumers exceeded
|
||||
description: JetStream has more than 100 active consumers
|
||||
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats frequent authentication timeouts
|
||||
description: There have been more than 5 authentication timeouts in the last 5 minutes
|
||||
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats max payload size exceeded
|
||||
description: The max payload size allowed by NATS has been exceeded (1MB)
|
||||
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: Nats leaf node connection issue
|
||||
description: No leaf node connections have been established in the last 5 minutes
|
||||
query: "increase(gnatsd_varz_leafnodes[5m]) == 0"
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: Nats max ping operations exceeded
|
||||
description: The maximum number of ping operations in NATS has exceeded 50
|
||||
query: "gnatsd_varz_ping_max > 50"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats write deadline exceeded
|
||||
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
|
||||
query: "gnatsd_varz_write_deadline > 10"
|
||||
severity: critical
|
||||
for: 5m
|
||||
|
||||
|
||||
- name: Solr
|
||||
exporters:
|
||||
|
|
|
|||
|
|
@ -206,7 +206,7 @@ groups:
|
|||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
|
||||
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
|
|
|
|||
146
dist/rules/nats/nats-exporter.yml
vendored
146
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -32,10 +32,154 @@ groups:
|
|||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_routez_num_routes > 10'
|
||||
expr: 'gnatsd_varz_routes > 10'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighMemoryUsage
|
||||
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high memory usage (instance {{ $labels.instance }})
|
||||
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsSlowConsumers
|
||||
expr: 'gnatsd_varz_slow_consumers > 0'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats slow consumers (instance {{ $labels.instance }})
|
||||
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsServerDown
|
||||
expr: 'absent(up{job="nats"})'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats server down (instance {{ $labels.instance }})
|
||||
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighCpuUsage
|
||||
expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high CPU usage (instance {{ $labels.instance }})
|
||||
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighNumberOfConnections
|
||||
expr: 'gnatsd_connz_num_connections > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high number of connections (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamStoreUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
|
||||
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighJetstreamMemoryUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
|
||||
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighNumberOfSubscriptions
|
||||
expr: 'gnatsd_connz_subscriptions > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighPendingBytes
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsTooManyErrors
|
||||
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats too many errors (instance {{ $labels.instance }})
|
||||
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsJetstreamConsumersExceeded
|
||||
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
|
||||
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsFrequentAuthenticationTimeouts
|
||||
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
|
||||
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsMaxPayloadSizeExceeded
|
||||
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
|
||||
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsLeafNodeConnectionIssue
|
||||
expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
|
||||
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsMaxPingOperationsExceeded
|
||||
expr: 'gnatsd_varz_ping_max > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
|
||||
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsWriteDeadlineExceeded
|
||||
expr: 'gnatsd_varz_write_deadline > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
|
||||
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
2
dist/rules/postgresql/postgres-exporter.yml
vendored
2
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -185,7 +185,7 @@ groups:
|
|||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlInvalidIndex
|
||||
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
|
|
|
|||
Loading…
Reference in a new issue