Merge branch 'master' into master

This commit is contained in:
guruevi 2024-10-03 08:41:53 -04:00 committed by GitHub
commit 860055d870
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 341 additions and 116 deletions

View file

@ -1,3 +1,3 @@
source 'https://rubygems.org'
gem 'github-pages', group: :jekyll_plugins
gem 'webrick', '~> 1.3', '>= 1.3.1'
gem 'github-pages', '>= 232', group: :jekyll_plugins
gem 'webrick', '~> 1.8'

View file

@ -1,66 +1,61 @@
GEM
remote: https://rubygems.org/
specs:
activesupport (6.0.6.1)
concurrent-ruby (~> 1.0, >= 1.0.2)
i18n (>= 0.7, < 2)
minitest (~> 5.1)
tzinfo (~> 1.1)
zeitwerk (~> 2.2, >= 2.2.2)
addressable (2.8.0)
public_suffix (>= 2.0.2, < 5.0)
activesupport (7.2.1)
base64
bigdecimal
concurrent-ruby (~> 1.0, >= 1.3.1)
connection_pool (>= 2.2.5)
drb
i18n (>= 1.6, < 2)
logger (>= 1.4.2)
minitest (>= 5.1)
securerandom (>= 0.3)
tzinfo (~> 2.0, >= 2.0.5)
addressable (2.8.7)
public_suffix (>= 2.0.2, < 7.0)
base64 (0.2.0)
bigdecimal (3.1.8)
coffee-script (2.4.1)
coffee-script-source
execjs
coffee-script-source (1.11.1)
coffee-script-source (1.12.2)
colorator (1.1.0)
commonmarker (0.23.10)
concurrent-ruby (1.2.0)
dnsruby (1.61.9)
simpleidn (~> 0.1)
concurrent-ruby (1.3.4)
connection_pool (2.4.1)
csv (3.3.0)
dnsruby (1.72.2)
simpleidn (~> 0.2.1)
drb (2.2.1)
em-websocket (0.5.3)
eventmachine (>= 0.12.9)
http_parser.rb (~> 0)
ethon (0.15.0)
ethon (0.16.0)
ffi (>= 1.15.0)
eventmachine (1.2.7)
execjs (2.8.1)
faraday (1.10.0)
faraday-em_http (~> 1.0)
faraday-em_synchrony (~> 1.0)
faraday-excon (~> 1.1)
faraday-httpclient (~> 1.0)
faraday-multipart (~> 1.0)
faraday-net_http (~> 1.0)
faraday-net_http_persistent (~> 1.0)
faraday-patron (~> 1.0)
faraday-rack (~> 1.0)
faraday-retry (~> 1.0)
ruby2_keywords (>= 0.0.4)
faraday-em_http (1.0.0)
faraday-em_synchrony (1.0.0)
faraday-excon (1.1.0)
faraday-httpclient (1.0.1)
faraday-multipart (1.0.3)
multipart-post (>= 1.2, < 3)
faraday-net_http (1.0.1)
faraday-net_http_persistent (1.2.0)
faraday-patron (1.0.0)
faraday-rack (1.0.0)
faraday-retry (1.0.3)
ffi (1.15.5)
execjs (2.9.1)
faraday (2.12.0)
faraday-net_http (>= 2.0, < 3.4)
json
logger
faraday-net_http (3.3.0)
net-http
ffi (1.17.0)
ffi (1.17.0-x86_64-linux-gnu)
ffi (1.17.0-x86_64-linux-musl)
forwardable-extended (2.6.0)
gemoji (3.0.1)
github-pages (226)
github-pages-health-check (= 1.17.9)
jekyll (= 3.9.2)
jekyll-avatar (= 0.7.0)
jekyll-coffeescript (= 1.1.1)
jekyll-commonmark-ghpages (= 0.2.0)
jekyll-default-layout (= 0.1.4)
jekyll-feed (= 0.15.1)
gemoji (4.1.0)
github-pages (232)
github-pages-health-check (= 1.18.2)
jekyll (= 3.10.0)
jekyll-avatar (= 0.8.0)
jekyll-coffeescript (= 1.2.2)
jekyll-commonmark-ghpages (= 0.5.1)
jekyll-default-layout (= 0.1.5)
jekyll-feed (= 0.17.0)
jekyll-gist (= 1.5.0)
jekyll-github-metadata (= 2.13.0)
jekyll-github-metadata (= 2.16.1)
jekyll-include-cache (= 0.2.1)
jekyll-mentions (= 1.6.0)
jekyll-optional-front-matter (= 0.3.2)
@ -87,32 +82,34 @@ GEM
jekyll-theme-tactile (= 0.2.0)
jekyll-theme-time-machine (= 0.2.0)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0)
kramdown (= 2.3.2)
jemoji (= 0.13.0)
kramdown (= 2.4.0)
kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.3)
liquid (= 4.0.4)
mercenary (~> 0.3)
minima (= 2.5.1)
nokogiri (>= 1.13.4, < 2.0)
rouge (= 3.26.0)
nokogiri (>= 1.16.2, < 2.0)
rouge (= 3.30.0)
terminal-table (~> 1.4)
github-pages-health-check (1.17.9)
webrick (~> 1.8)
github-pages-health-check (1.18.2)
addressable (~> 2.3)
dnsruby (~> 1.60)
octokit (~> 4.0)
public_suffix (>= 3.0, < 5.0)
octokit (>= 4, < 8)
public_suffix (>= 3.0, < 6.0)
typhoeus (~> 1.3)
html-pipeline (2.14.1)
html-pipeline (2.14.3)
activesupport (>= 2)
nokogiri (>= 1.4)
http_parser.rb (0.8.0)
i18n (0.9.5)
i18n (1.14.6)
concurrent-ruby (~> 1.0)
jekyll (3.9.2)
jekyll (3.10.0)
addressable (~> 2.4)
colorator (~> 1.0)
csv (~> 3.0)
em-websocket (~> 0.5)
i18n (~> 0.7)
i18n (>= 0.7, < 2)
jekyll-sass-converter (~> 1.0)
jekyll-watch (~> 2.0)
kramdown (>= 1.17, < 3)
@ -121,27 +118,28 @@ GEM
pathutil (~> 0.9)
rouge (>= 1.7, < 4)
safe_yaml (~> 1.0)
jekyll-avatar (0.7.0)
webrick (>= 1.0)
jekyll-avatar (0.8.0)
jekyll (>= 3.0, < 5.0)
jekyll-coffeescript (1.1.1)
jekyll-coffeescript (1.2.2)
coffee-script (~> 2.2)
coffee-script-source (~> 1.11.1)
coffee-script-source (~> 1.12)
jekyll-commonmark (1.4.0)
commonmarker (~> 0.22)
jekyll-commonmark-ghpages (0.2.0)
commonmarker (~> 0.23.4)
jekyll (~> 3.9.0)
jekyll-commonmark-ghpages (0.5.1)
commonmarker (>= 0.23.7, < 1.1.0)
jekyll (>= 3.9, < 4.0)
jekyll-commonmark (~> 1.4.0)
rouge (>= 2.0, < 4.0)
jekyll-default-layout (0.1.4)
jekyll (~> 3.0)
jekyll-feed (0.15.1)
rouge (>= 2.0, < 5.0)
jekyll-default-layout (0.1.5)
jekyll (>= 3.0, < 5.0)
jekyll-feed (0.17.0)
jekyll (>= 3.7, < 5.0)
jekyll-gist (1.5.0)
octokit (~> 4.2)
jekyll-github-metadata (2.13.0)
jekyll-github-metadata (2.16.1)
jekyll (>= 3.4, < 5.0)
octokit (~> 4.0, != 4.4.0)
octokit (>= 4, < 7, != 4.4.0)
jekyll-include-cache (0.2.1)
jekyll (>= 3.7, < 5.0)
jekyll-mentions (1.6.0)
@ -212,41 +210,46 @@ GEM
jekyll (>= 3.3, < 5.0)
jekyll-watch (2.2.1)
listen (~> 3.0)
jemoji (0.12.0)
gemoji (~> 3.0)
jemoji (0.13.0)
gemoji (>= 3, < 5)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (2.3.2)
json (2.7.2)
kramdown (2.4.0)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
liquid (4.0.3)
listen (3.7.1)
liquid (4.0.4)
listen (3.9.0)
rb-fsevent (~> 0.10, >= 0.10.3)
rb-inotify (~> 0.9, >= 0.9.10)
logger (1.6.1)
mercenary (0.3.6)
mini_portile2 (2.8.7)
minima (2.5.1)
jekyll (>= 3.5, < 5.0)
jekyll-feed (~> 0.9)
jekyll-seo-tag (~> 2.1)
minitest (5.17.0)
multipart-post (2.1.1)
nokogiri (1.16.5-x86_64-linux)
minitest (5.25.1)
net-http (0.4.1)
uri
nokogiri (1.16.7)
mini_portile2 (~> 2.8.2)
racc (~> 1.4)
octokit (4.22.0)
faraday (>= 0.9)
sawyer (~> 0.8.0, >= 0.5.3)
nokogiri (1.16.7-x86_64-linux)
racc (~> 1.4)
octokit (4.25.1)
faraday (>= 1, < 3)
sawyer (~> 0.9)
pathutil (0.16.2)
forwardable-extended (~> 2.6)
public_suffix (4.0.7)
racc (1.7.3)
rb-fsevent (0.11.1)
rb-inotify (0.10.1)
public_suffix (5.1.1)
racc (1.8.1)
rb-fsevent (0.11.2)
rb-inotify (0.11.1)
ffi (~> 1.0)
rexml (3.2.8)
strscan (>= 3.0.9)
rouge (3.26.0)
ruby2_keywords (0.0.5)
rexml (3.3.7)
rouge (3.30.0)
rubyzip (2.3.2)
safe_yaml (1.0.5)
sass (3.7.4)
@ -254,33 +257,29 @@ GEM
sass-listen (4.0.0)
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
sawyer (0.8.2)
sawyer (0.9.2)
addressable (>= 2.3.5)
faraday (> 0.8, < 2.0)
simpleidn (0.2.1)
unf (~> 0.1.4)
strscan (3.1.0)
faraday (>= 0.17.3, < 3)
securerandom (0.3.1)
simpleidn (0.2.3)
terminal-table (1.8.0)
unicode-display_width (~> 1.1, >= 1.1.1)
thread_safe (0.3.6)
typhoeus (1.4.0)
typhoeus (1.4.1)
ethon (>= 0.9.0)
tzinfo (1.2.11)
thread_safe (~> 0.1)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.1)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
unicode-display_width (1.8.0)
webrick (1.7.0)
zeitwerk (2.6.6)
uri (0.13.1)
webrick (1.8.2)
PLATFORMS
ruby
x86_64-linux
x86_64-linux-musl
DEPENDENCIES
github-pages
webrick (~> 1.3, >= 1.3.1)
github-pages (>= 232)
webrick (~> 1.8)
BUNDLED WITH
2.3.13

View file

@ -267,8 +267,7 @@ groups:
for: 5m
# Critical alert raised when either hwmon temperature-alarm series equals 1.
# NOTE(review): two `query` keys are listed below (the shorter form and the one
# that multiplies by node_uname_info with group_left(nodename) to copy the
# nodename label onto the result); a YAML mapping may only keep one of them.
- name: Host node overtemperature alarm
description: "Physical node temperature alarm triggered"
# This is a critical alarm, some things (eg. NVMe) have just the temp alarm.
query: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: critical
- name: Host Software RAID insufficient drives
description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining."
@ -744,9 +743,11 @@ groups:
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
# Warns when an index-size series whose indexrelname contains "ccnew" has been
# present for 6h.
# NOTE(review): two `query` keys are listed below; they differ only in the
# metric-name spelling (pg_genaral_... vs pg_general_...). Only one may remain.
- name: Postgresql invalid index
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
severity: warning
for: 6h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: SQL Server
exporters:
@ -1546,9 +1547,90 @@ groups:
for: 3m
# Warns when the route-count metric stays above 10 for 3 minutes.
# NOTE(review): two `query` keys are listed below (gnatsd_routez_num_routes and
# gnatsd_varz_routes); a YAML mapping may only keep one of them.
- name: Nats high routes count
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
query: "gnatsd_routez_num_routes > 10"
query: "gnatsd_varz_routes > 10"
severity: warning
for: 3m
# Warns when gnatsd_varz_mem stays above 200 * 1024 * 1024 (the "200MB" of the
# description, expressed in bytes) for 5 minutes.
- name: Nats high memory usage
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
severity: warning
for: 5m
# Critical alert raised when gnatsd_varz_slow_consumers stays above 0 for
# 3 minutes.
- name: Nats slow consumers
description: There are slow consumers in NATS for {{ $labels.instance }}
query: "gnatsd_varz_slow_consumers > 0"
severity: critical
for: 3m
# Critical alert raised when no `up` series with job="nats" exists for 5 minutes.
# NOTE(review): absent() only matches when the series is missing altogether; a
# target that is still configured but failing its scrape reports up == 0 and
# would not trigger this rule. It also assumes the scrape job is literally
# named "nats" - confirm both against the deployment.
- name: Nats server down
description: NATS server has been down for more than 5 minutes
query: 'absent(up{job="nats"})'
severity: critical
for: 5m
# Warns when the NATS server's CPU usage stays above 80% for 5 minutes.
# gnatsd_varz_cpu mirrors the `cpu` field of the /varz endpoint, which is a
# point-in-time percentage, so it is compared against 80 directly. rate() is
# only meaningful on counters; the sustained-for-5-minutes requirement is
# already provided by `for: 5m`.
- name: Nats high CPU usage
description: NATS server is using more than 80% CPU for the last 5 minutes
query: "gnatsd_varz_cpu > 80"
severity: warning
for: 5m
# Warns when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
- name: Nats high number of connections
description: NATS server has more than 1000 active connections
query: "gnatsd_connz_num_connections > 1000"
severity: warning
for: 5m
# Warns when the ratio of JetStream storage in use to the configured maximum
# storage stays above 0.8 (80%) for 5 minutes.
- name: Nats high JetStream store usage
description: JetStream store usage is over 80%
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
severity: warning
for: 5m
# Warns when the ratio of JetStream memory in use to the configured maximum
# memory stays above 0.8 (80%) for 5 minutes.
- name: Nats high JetStream memory usage
description: JetStream memory usage is over 80%
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
severity: warning
for: 5m
# Warns when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
- name: Nats high number of subscriptions
description: NATS server has more than 1000 active subscriptions
query: "gnatsd_connz_subscriptions > 1000"
severity: warning
for: 5m
# Warns when gnatsd_connz_pending_bytes stays above 100000 for 5 minutes.
- name: Nats high pending bytes
description: NATS server has more than 100,000 pending bytes
query: "gnatsd_connz_pending_bytes > 100000"
severity: warning
for: 5m
# Warns when the JetStream API error metric has grown over the trailing
# 5-minute window, and keeps doing so for 5 minutes.
# NOTE(review): despite the generic name, only JetStream API errors are
# inspected by this query.
- name: Nats too many errors
description: NATS server has encountered errors in the last 5 minutes
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
severity: warning
for: 5m
# Warns when the sum of gnatsd_varz_jetstream_stats_accounts across all series
# stays above 100 for 5 minutes.
# NOTE(review): the metric name suggests it counts JetStream accounts, not
# consumers as the name/description say - confirm the intended metric.
- name: Nats JetStream consumers exceeded
description: JetStream has more than 100 active consumers
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
severity: warning
for: 5m
# Warns when gnatsd_varz_auth_timeout has increased by more than 5 over the
# trailing 5-minute window, sustained for 5 minutes.
# NOTE(review): presumably auth_timeout in /varz is the configured
# authentication timeout rather than a count of timeouts; if so this
# expression will not track timeout events - verify against the exporter.
- name: Nats frequent authentication timeouts
description: There have been more than 5 authentication timeouts in the last 5 minutes
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
severity: warning
for: 5m
# Critical alert raised when the largest gnatsd_varz_max_payload value stays
# above 1024 * 1024 for 5 minutes.
# NOTE(review): max_payload looks like the configured payload limit, so this
# would detect a limit set above 1MB rather than a message exceeding the
# limit - confirm intent.
- name: Nats max payload size exceeded
description: The max payload size allowed by NATS has been exceeded (1MB)
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
severity: critical
for: 5m
# Critical alert raised when a server reports zero leaf node connections for
# 5 minutes.
# gnatsd_varz_leafnodes is a current-connection count, not a counter, so it is
# tested directly: increase(...) == 0 is true whenever the count is merely
# stable, which would fire on every healthy server.
# NOTE(review): servers that are not meant to have leaf nodes also report 0;
# scope the selector with label matchers if only some servers act as hubs.
- name: Nats leaf node connection issue
description: No leaf node connections have been established in the last 5 minutes
query: "gnatsd_varz_leafnodes == 0"
severity: critical
for: 5m
# Warns when gnatsd_varz_ping_max stays above 50 for 5 minutes.
# NOTE(review): ping_max looks like a configured server setting rather than a
# runtime count of pings - confirm this reflects the condition described.
- name: Nats max ping operations exceeded
description: The maximum number of ping operations in NATS has exceeded 50
query: "gnatsd_varz_ping_max > 50"
severity: warning
for: 5m
# Critical alert raised when gnatsd_varz_write_deadline stays above 10 for
# 5 minutes.
# NOTE(review): write_deadline appears to be the configured deadline, not a
# count of missed deadlines, and its unit is not visible here. If it is
# exported in nanoseconds this comparison would always be true - verify before
# relying on this rule.
- name: Nats write deadline exceeded
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
query: "gnatsd_varz_write_deadline > 10"
severity: critical
for: 5m
- name: Solr
exporters:

View file

@ -206,7 +206,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNodeOvertemperatureAlarm
expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1'
expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical

View file

@ -32,10 +32,154 @@ groups:
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: route-count metric above 10 for 3 minutes.
# NOTE(review): two `expr` keys are listed below (gnatsd_routez_num_routes and
# gnatsd_varz_routes); a rule may only keep one of them.
- alert: NatsHighRoutesCount
expr: 'gnatsd_routez_num_routes > 10'
expr: 'gnatsd_varz_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_varz_mem above 200 * 1024 * 1024 for 5 minutes.
- alert: NatsHighMemoryUsage
expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high memory usage (instance {{ $labels.instance }})
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert: gnatsd_varz_slow_consumers above 0 for 3 minutes.
- alert: NatsSlowConsumers
expr: 'gnatsd_varz_slow_consumers > 0'
for: 3m
labels:
severity: critical
annotations:
summary: Nats slow consumers (instance {{ $labels.instance }})
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert: no `up` series with job="nats" exists for 5 minutes.
# NOTE(review): absent() does not match a target that is scraped but failing
# (up == 0), and its result carries only the labels from the equality matchers,
# so {{ $labels.instance }} in the summary is expected to render empty - confirm.
- alert: NatsServerDown
expr: 'absent(up{job="nats"})'
for: 5m
labels:
severity: critical
annotations:
summary: Nats server down (instance {{ $labels.instance }})
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: NATS server CPU usage above 80% for 5 minutes.
# gnatsd_varz_cpu mirrors the `cpu` field of the /varz endpoint, which is a
# point-in-time percentage, so it is compared against 80 directly. rate() is
# only meaningful on counters; `for: 5m` supplies the sustained-duration check.
- alert: NatsHighCpuUsage
expr: 'gnatsd_varz_cpu > 80'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high CPU usage (instance {{ $labels.instance }})
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_connz_num_connections above 1000 for 5 minutes.
- alert: NatsHighNumberOfConnections
expr: 'gnatsd_connz_num_connections > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of connections (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: JetStream storage in use divided by the configured maximum
# storage is above 0.8 for 5 minutes.
- alert: NatsHighJetstreamStoreUsage
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream store usage (instance {{ $labels.instance }})
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: JetStream memory in use divided by the configured maximum
# memory is above 0.8 for 5 minutes.
- alert: NatsHighJetstreamMemoryUsage
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_connz_subscriptions above 1000 for 5 minutes.
- alert: NatsHighNumberOfSubscriptions
expr: 'gnatsd_connz_subscriptions > 1000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high number of subscriptions (instance {{ $labels.instance }})
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_connz_pending_bytes above 100000 for 5 minutes.
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 5m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: the JetStream API error metric has grown over the trailing
# 5-minute window, sustained for 5 minutes.
# NOTE(review): only JetStream API errors are inspected, despite the generic
# alert name.
- alert: NatsTooManyErrors
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
for: 5m
labels:
severity: warning
annotations:
summary: Nats too many errors (instance {{ $labels.instance }})
description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: sum of gnatsd_varz_jetstream_stats_accounts above 100 for
# 5 minutes.
# NOTE(review): the metric name suggests it counts accounts, not consumers.
# Also, sum() without a `by` clause drops all labels, so
# {{ $labels.instance }} in the summary is expected to render empty - confirm.
- alert: NatsJetstreamConsumersExceeded
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
for: 5m
labels:
severity: warning
annotations:
summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_varz_auth_timeout increased by more than 5 over the
# trailing 5-minute window, sustained for 5 minutes.
# NOTE(review): presumably auth_timeout is the configured timeout rather than
# a count of timeout events - verify against the exporter.
- alert: NatsFrequentAuthenticationTimeouts
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
for: 5m
labels:
severity: warning
annotations:
summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert: largest gnatsd_varz_max_payload value above 1024 * 1024 for
# 5 minutes.
# NOTE(review): max_payload looks like the configured limit, so this detects a
# limit set above 1MB rather than an oversized message. max() without a `by`
# clause also drops the instance label used in the summary - confirm intent.
- alert: NatsMaxPayloadSizeExceeded
expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
for: 5m
labels:
severity: critical
annotations:
summary: Nats max payload size exceeded (instance {{ $labels.instance }})
description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert: a server reports zero leaf node connections for 5 minutes.
# gnatsd_varz_leafnodes is a current-connection count, not a counter, so it is
# tested directly: increase(...) == 0 is true whenever the count is merely
# stable, which would fire on every healthy server.
# NOTE(review): servers that are not meant to have leaf nodes also report 0;
# scope the selector with label matchers if only some servers act as hubs.
- alert: NatsLeafNodeConnectionIssue
expr: 'gnatsd_varz_leafnodes == 0'
for: 5m
labels:
severity: critical
annotations:
summary: Nats leaf node connection issue (instance {{ $labels.instance }})
description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warning alert: gnatsd_varz_ping_max above 50 for 5 minutes.
# NOTE(review): ping_max looks like a configured server setting rather than a
# runtime count of pings - confirm this reflects the condition described.
- alert: NatsMaxPingOperationsExceeded
expr: 'gnatsd_varz_ping_max > 50'
for: 5m
labels:
severity: warning
annotations:
summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert: gnatsd_varz_write_deadline above 10 for 5 minutes.
# NOTE(review): write_deadline appears to be the configured deadline, not a
# count of missed deadlines, and its unit is not visible here. If it is
# exported in nanoseconds this comparison would always be true - verify before
# relying on this rule.
- alert: NatsWriteDeadlineExceeded
expr: 'gnatsd_varz_write_deadline > 10'
for: 5m
labels:
severity: critical
annotations:
summary: Nats write deadline exceeded (instance {{ $labels.instance }})
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -185,7 +185,7 @@ groups:
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning