diff --git a/Gemfile b/Gemfile index 31ddf5d..cddfa60 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' -gem 'github-pages', group: :jekyll_plugins -gem 'webrick', '~> 1.3', '>= 1.3.1' \ No newline at end of file +gem 'github-pages', '>= 232', group: :jekyll_plugins +gem 'webrick', '~> 1.8' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index ca3c33c..1afa3ed 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,66 +1,61 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.6.1) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) + activesupport (7.2.1) + base64 + bigdecimal + concurrent-ruby (~> 1.0, >= 1.3.1) + connection_pool (>= 2.2.5) + drb + i18n (>= 1.6, < 2) + logger (>= 1.4.2) + minitest (>= 5.1) + securerandom (>= 0.3) + tzinfo (~> 2.0, >= 2.0.5) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) + base64 (0.2.0) + bigdecimal (3.1.8) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.11.1) + coffee-script-source (1.12.2) colorator (1.1.0) commonmarker (0.23.10) - concurrent-ruby (1.2.0) - dnsruby (1.61.9) - simpleidn (~> 0.1) + concurrent-ruby (1.3.4) + connection_pool (2.4.1) + csv (3.3.0) + dnsruby (1.72.2) + simpleidn (~> 0.2.1) + drb (2.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) - ethon (0.15.0) + ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.8.1) - faraday (1.10.0) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) - ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.3) - multipart-post (>= 1.2, < 3) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) - ffi (1.15.5) + execjs (2.9.1) + faraday (2.12.0) + faraday-net_http (>= 2.0, < 3.4) + json + logger + faraday-net_http (3.3.0) + net-http + ffi (1.17.0) + ffi (1.17.0-x86_64-linux-gnu) + ffi (1.17.0-x86_64-linux-musl) forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (226) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.2) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) + gemoji (4.1.0) + github-pages (232) + github-pages-health-check (= 1.18.2) + jekyll (= 3.10.0) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) + jekyll-commonmark-ghpages (= 0.5.1) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) + jekyll-github-metadata (= 2.16.1) jekyll-include-cache (= 0.2.1) jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) @@ -87,32 +82,34 @@ GEM jekyll-theme-tactile (= 0.2.0) jekyll-theme-time-machine (= 0.2.0) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) + jemoji (= 0.13.0) + kramdown (= 2.4.0) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.4, < 2.0) - rouge (= 3.26.0) + nokogiri (>= 1.16.2, < 2.0) + rouge (= 3.30.0) terminal-table (~> 1.4) - github-pages-health-check (1.17.9) + webrick (~> 1.8) + github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) + octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) typhoeus (~> 1.3) - html-pipeline (2.14.1) + html-pipeline (2.14.3) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (0.9.5) + i18n (1.14.6) concurrent-ruby (~> 1.0) - jekyll (3.9.2) + jekyll (3.10.0) addressable (~> 2.4) colorator (~> 1.0) + csv (~> 3.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -121,27 +118,28 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) + webrick (>= 1.0) + jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) + jekyll-coffeescript (1.2.2) coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) + coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) - jekyll (~> 3.9.0) + jekyll-commonmark-ghpages (0.5.1) + commonmarker (>= 0.23.7, < 1.1.0) + jekyll (>= 3.9, < 4.0) jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) + rouge (>= 2.0, < 5.0) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.13.0) + jekyll-github-metadata (2.16.1) jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) + octokit (>= 4, < 7, != 4.4.0) jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) jekyll-mentions (1.6.0) @@ -212,41 +210,46 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) + json (2.7.2) + kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) + liquid (4.0.4) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) + logger (1.6.1) mercenary (0.3.6) + mini_portile2 (2.8.7) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.17.0) - multipart-post (2.1.1) - nokogiri (1.16.5-x86_64-linux) + minitest (5.25.1) + net-http (0.4.1) + uri + nokogiri (1.16.7) + mini_portile2 (~> 2.8.2) racc (~> 1.4) - octokit (4.22.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) + nokogiri (1.16.7-x86_64-linux) + racc (~> 1.4) + octokit (4.25.1) + faraday (>= 1, < 3) + sawyer (~> 0.9) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.7.3) - rb-fsevent (0.11.1) - rb-inotify (0.10.1) + public_suffix (5.1.1) + racc (1.8.1) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.2.8) - strscan (>= 3.0.9) - rouge (3.26.0) - ruby2_keywords (0.0.5) + rexml (3.3.7) + rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) @@ -254,33 +257,29 @@ GEM sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) + sawyer (0.9.2) addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) - strscan (3.1.0) + faraday (>= 0.17.3, < 3) + securerandom (0.3.1) + simpleidn (0.2.3) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) - tzinfo (1.2.11) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.8.1) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unicode-display_width (1.8.0) - webrick (1.7.0) - zeitwerk (2.6.6) + uri (0.13.1) + webrick (1.8.2) PLATFORMS + ruby x86_64-linux x86_64-linux-musl DEPENDENCIES - github-pages - webrick (~> 1.3, >= 1.3.1) + github-pages (>= 232) + webrick (~> 1.8) BUNDLED WITH 2.3.13 diff --git a/_data/rules.yml b/_data/rules.yml index 0121157..c155775 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -267,8 +267,7 @@ groups: for: 5m - name: Host node overtemperature alarm description: "Physical node temperature alarm triggered" - # This is a critical alarm, some things (eg. NVMe) have just the temp alarm. - query: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1' + query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' severity: critical - name: Host Software RAID insufficient drives description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining." @@ -744,9 +743,11 @@ groups: See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql invalid index description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`" - query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' severity: warning for: 6h + comments: | + See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: SQL Server exporters: @@ -1546,9 +1547,90 @@ groups: for: 3m - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} - query: "gnatsd_routez_num_routes > 10" + query: "gnatsd_varz_routes > 10" severity: warning for: 3m + - name: Nats high memory usage + description: NATS server memory usage is above 200MB for {{ $labels.instance }} + query: "gnatsd_varz_mem > 200 * 1024 * 1024" + severity: warning + for: 5m + - name: Nats slow consumers + description: There are slow consumers in NATS for {{ $labels.instance }} + query: "gnatsd_varz_slow_consumers > 0" + severity: critical + for: 3m + - name: Nats server down + description: NATS server has been down for more than 5 minutes + query: 'absent(up{job="nats"})' + severity: critical + for: 5m + - name: Nats high CPU usage + description: NATS server is using more than 80% CPU for the last 5 minutes + query: "rate(gnatsd_varz_cpu[5m]) > 0.8" + severity: warning + for: 5m + - name: Nats high number of connections + description: NATS server has more than 1000 active connections + query: "gnatsd_connz_num_connections > 1000" + severity: warning + for: 5m + - name: Nats high JetStream store usage + description: JetStream store usage is over 80% + query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8" + severity: warning + for: 5m + - name: Nats high JetStream memory usage + description: JetStream memory usage is over 80% + query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8" + severity: warning + for: 5m + - name: Nats high number of subscriptions + description: NATS server has more than 1000 active subscriptions + query: "gnatsd_connz_subscriptions > 1000" + severity: warning + for: 5m + - name: Nats high pending bytes + description: NATS server has more than 100,000 pending bytes + query: "gnatsd_connz_pending_bytes > 100000" + severity: warning + for: 5m + - name: Nats too many errors + description: NATS server has encountered errors in the last 5 minutes + query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" + severity: warning + for: 5m + - name: Nats JetStream consumers exceeded + description: JetStream has more than 100 active consumers + query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" + severity: warning + for: 5m + - name: Nats frequent authentication timeouts + description: There have been more than 5 authentication timeouts in the last 5 minutes + query: "increase(gnatsd_varz_auth_timeout[5m]) > 5" + severity: warning + for: 5m + - name: Nats max payload size exceeded + description: The max payload size allowed by NATS has been exceeded (1MB) + query: "max(gnatsd_varz_max_payload) > 1024 * 1024" + severity: critical + for: 5m + - name: Nats leaf node connection issue + description: No leaf node connections have been established in the last 5 minutes + query: "increase(gnatsd_varz_leafnodes[5m]) == 0" + severity: critical + for: 5m + - name: Nats max ping operations exceeded + description: The maximum number of ping operations in NATS has exceeded 50 + query: "gnatsd_varz_ping_max > 50" + severity: warning + for: 5m + - name: Nats write deadline exceeded + description: The write deadline has been exceeded in NATS, indicating potential message delivery issues + query: "gnatsd_varz_write_deadline > 10" + severity: critical + for: 5m + - name: Solr exporters: diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 301ae91..89eae23 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -206,7 +206,7 @@ groups: description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm - expr: 'node_hwmon_temp_crit_alarm_celsius == 1 or node_hwmon_temp_alarm == 1' + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index 13eda2b..7648762 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -32,10 +32,154 @@ groups: description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsHighRoutesCount - expr: 'gnatsd_routez_num_routes > 10' + expr: 'gnatsd_varz_routes > 10' for: 3m labels: severity: warning annotations: summary: Nats high routes count (instance {{ $labels.instance }}) description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighMemoryUsage + expr: 'gnatsd_varz_mem > 200 * 1024 * 1024' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high memory usage (instance {{ $labels.instance }}) + description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsSlowConsumers + expr: 'gnatsd_varz_slow_consumers > 0' + for: 3m + labels: + severity: critical + annotations: + summary: Nats slow consumers (instance {{ $labels.instance }}) + description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsServerDown + expr: 'absent(up{job="nats"})' + for: 5m + labels: + severity: critical + annotations: + summary: Nats server down (instance {{ $labels.instance }}) + description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighCpuUsage + expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high CPU usage (instance {{ $labels.instance }}) + description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighNumberOfConnections + expr: 'gnatsd_connz_num_connections > 1000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high number of connections (instance {{ $labels.instance }}) + description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighJetstreamStoreUsage + expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high JetStream store usage (instance {{ $labels.instance }}) + description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighJetstreamMemoryUsage + expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high JetStream memory usage (instance {{ $labels.instance }}) + description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighNumberOfSubscriptions + expr: 'gnatsd_connz_subscriptions > 1000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high number of subscriptions (instance {{ $labels.instance }}) + description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighPendingBytes + expr: 'gnatsd_connz_pending_bytes > 100000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high pending bytes (instance {{ $labels.instance }}) + description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsTooManyErrors + expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: Nats too many errors (instance {{ $labels.instance }}) + description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsJetstreamConsumersExceeded + expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100' + for: 5m + labels: + severity: warning + annotations: + summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }}) + description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsFrequentAuthenticationTimeouts + expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5' + for: 5m + labels: + severity: warning + annotations: + summary: Nats frequent authentication timeouts (instance {{ $labels.instance }}) + description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsMaxPayloadSizeExceeded + expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024' + for: 5m + labels: + severity: critical + annotations: + summary: Nats max payload size exceeded (instance {{ $labels.instance }}) + description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsLeafNodeConnectionIssue + expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0' + for: 5m + labels: + severity: critical + annotations: + summary: Nats leaf node connection issue (instance {{ $labels.instance }}) + description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsMaxPingOperationsExceeded + expr: 'gnatsd_varz_ping_max > 50' + for: 5m + labels: + severity: warning + annotations: + summary: Nats max ping operations exceeded (instance {{ $labels.instance }}) + description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsWriteDeadlineExceeded + expr: 'gnatsd_varz_write_deadline > 10' + for: 5m + labels: + severity: critical + annotations: + summary: Nats write deadline exceeded (instance {{ $labels.instance }}) + description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 48e8f36..5cc6e03 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -185,7 +185,7 @@ groups: description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlInvalidIndex - expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' for: 6h labels: severity: warning