From 2c764df932e61bfeb81c396ece3d93bd7a4cf3bc Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 18 Jul 2024 10:14:45 +0200 Subject: [PATCH 01/13] fix: Gemfile & Gemfile.lock to reduce vulnerabilities (#426) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-RUBY-REXML-7462086 Co-authored-by: snyk-bot --- Gemfile | 2 +- Gemfile.lock | 189 ++++++++++++++++++++++++--------------------------- 2 files changed, 90 insertions(+), 101 deletions(-) diff --git a/Gemfile b/Gemfile index 31ddf5d..eef87b6 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' -gem 'github-pages', group: :jekyll_plugins +gem 'github-pages', '>= 227', group: :jekyll_plugins gem 'webrick', '~> 1.3', '>= 1.3.1' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index ca3c33c..c11fe91 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,66 +1,56 @@ GEM remote: https://rubygems.org/ specs: - activesupport (6.0.6.1) + activesupport (7.1.3.4) + base64 + bigdecimal concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) + connection_pool (>= 2.2.5) + drb + i18n (>= 1.6, < 2) + minitest (>= 5.1) + mutex_m + tzinfo (~> 2.0) + addressable (2.8.7) + public_suffix (>= 2.0.2, < 7.0) + base64 (0.2.0) + bigdecimal (3.1.8) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.11.1) + coffee-script-source (1.12.2) colorator (1.1.0) commonmarker (0.23.10) - concurrent-ruby (1.2.0) - dnsruby (1.61.9) - simpleidn (~> 0.1) + concurrent-ruby (1.3.3) + connection_pool (2.4.1) + dnsruby (1.72.2) + simpleidn (~> 0.2.1) + drb (2.2.1) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) - ethon (0.15.0) + ethon (0.16.0) ffi (>= 1.15.0) eventmachine (1.2.7) - execjs (2.8.1) - faraday (1.10.0) - faraday-em_http (~> 1.0) - faraday-em_synchrony (~> 1.0) - faraday-excon (~> 1.1) - faraday-httpclient (~> 1.0) - faraday-multipart (~> 1.0) - faraday-net_http (~> 1.0) - faraday-net_http_persistent (~> 1.0) - faraday-patron (~> 1.0) - faraday-rack (~> 1.0) - faraday-retry (~> 1.0) + execjs (2.9.1) + faraday (2.8.1) + base64 + faraday-net_http (>= 2.0, < 3.1) ruby2_keywords (>= 0.0.4) - faraday-em_http (1.0.0) - faraday-em_synchrony (1.0.0) - faraday-excon (1.1.0) - faraday-httpclient (1.0.1) - faraday-multipart (1.0.3) - multipart-post (>= 1.2, < 3) - faraday-net_http (1.0.1) - faraday-net_http_persistent (1.2.0) - faraday-patron (1.0.0) - faraday-rack (1.0.0) - faraday-retry (1.0.3) - ffi (1.15.5) + faraday-net_http (3.0.2) + ffi (1.16.3) forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (226) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.2) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) + gemoji (4.1.0) + github-pages (231) + github-pages-health-check (= 1.18.2) + jekyll (= 3.9.5) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) + jekyll-commonmark-ghpages (= 0.4.0) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) + jekyll-github-metadata (= 2.16.1) jekyll-include-cache (= 0.2.1) jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) @@ -87,32 +77,32 @@ GEM jekyll-theme-tactile (= 0.2.0) jekyll-theme-time-machine (= 0.2.0) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) + jemoji (= 0.13.0) + kramdown (= 2.4.0) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.4, < 2.0) - rouge (= 3.26.0) + nokogiri (>= 1.13.6, < 2.0) + rouge (= 3.30.0) terminal-table (~> 1.4) - github-pages-health-check (1.17.9) + github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) + octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) typhoeus (~> 1.3) - html-pipeline (2.14.1) + html-pipeline (2.14.3) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (0.9.5) + i18n (1.14.5) concurrent-ruby (~> 1.0) - jekyll (3.9.2) + jekyll (3.9.5) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -121,27 +111,27 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) + jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) + jekyll-coffeescript (1.2.2) coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) + coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) + jekyll-commonmark-ghpages (0.4.0) + commonmarker (~> 0.23.7) jekyll (~> 3.9.0) jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) + rouge (>= 2.0, < 5.0) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.13.0) + jekyll-github-metadata (2.16.1) jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) + octokit (>= 4, < 7, != 4.4.0) jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) jekyll-mentions (1.6.0) @@ -212,40 +202,44 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) + kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) + liquid (4.0.4) + listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) + mini_portile2 (2.8.7) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.17.0) - multipart-post (2.1.1) - nokogiri (1.16.5-x86_64-linux) + minitest (5.24.1) + mutex_m (0.2.0) + nokogiri (1.15.6) + mini_portile2 (~> 2.8.2) racc (~> 1.4) - octokit (4.22.0) - faraday (>= 0.9) - sawyer (~> 0.8.0, >= 0.5.3) + nokogiri (1.15.6-x86_64-linux) + racc (~> 1.4) + octokit (4.25.1) + faraday (>= 1, < 3) + sawyer (~> 0.9) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.7.3) - rb-fsevent (0.11.1) - rb-inotify (0.10.1) + public_suffix (5.1.1) + racc (1.8.0) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.2.8) - strscan (>= 3.0.9) - rouge (3.26.0) + rexml (3.3.2) + strscan + rouge (3.30.0) ruby2_keywords (0.0.5) rubyzip (2.3.2) safe_yaml (1.0.5) @@ -254,32 +248,27 @@ GEM sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.2) + sawyer (0.9.2) addressable (>= 2.3.5) - faraday (> 0.8, < 2.0) - simpleidn (0.2.1) - unf (~> 0.1.4) + faraday (>= 0.17.3, < 3) + simpleidn (0.2.3) strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) + typhoeus (1.4.1) ethon (>= 0.9.0) - tzinfo (1.2.11) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.8.1) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unicode-display_width (1.8.0) webrick (1.7.0) - zeitwerk (2.6.6) PLATFORMS + ruby x86_64-linux x86_64-linux-musl DEPENDENCIES - github-pages + github-pages (>= 227) webrick (~> 1.3, >= 1.3.1) BUNDLED WITH From 225607cf7f72d075c759d52921203906a3a10fd6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 30 Jul 2024 17:25:23 +0200 Subject: [PATCH 02/13] build(deps-dev): bump nokogiri from 1.15.6 to 1.16.5 (#427) Bumps [nokogiri](https://github.com/sparklemotion/nokogiri) from 1.15.6 to 1.16.5. - [Release notes](https://github.com/sparklemotion/nokogiri/releases) - [Changelog](https://github.com/sparklemotion/nokogiri/blob/main/CHANGELOG.md) - [Commits](https://github.com/sparklemotion/nokogiri/compare/v1.15.6...v1.16.5) --- updated-dependencies: - dependency-name: nokogiri dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index c11fe91..f41c2f6 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -222,10 +222,10 @@ GEM jekyll-seo-tag (~> 2.1) minitest (5.24.1) mutex_m (0.2.0) - nokogiri (1.15.6) + nokogiri (1.16.5) mini_portile2 (~> 2.8.2) racc (~> 1.4) - nokogiri (1.15.6-x86_64-linux) + nokogiri (1.16.5-x86_64-linux) racc (~> 1.4) octokit (4.25.1) faraday (>= 1, < 3) @@ -233,7 +233,7 @@ GEM pathutil (0.16.2) forwardable-extended (~> 2.6) public_suffix (5.1.1) - racc (1.8.0) + racc (1.8.1) rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) From 61da73d5171e02f542be374d85bc0a2655857666 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 2 Aug 2024 14:14:26 +0200 Subject: [PATCH 03/13] build(deps-dev): bump rexml from 3.3.2 to 3.3.3 (#428) Bumps [rexml](https://github.com/ruby/rexml) from 3.3.2 to 3.3.3. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.3.2...v3.3.3) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index f41c2f6..f2d1111 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -237,7 +237,7 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.2) + rexml (3.3.3) strscan rouge (3.30.0) ruby2_keywords (0.0.5) From d1715de75150fb714a4bd6cd489935ccc7d6282b Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Tue, 20 Aug 2024 18:31:08 +0200 Subject: [PATCH 04/13] fix PostgresqlInvalidIndex rule --- _data/rules.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 0216beb..b9506d2 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -730,9 +730,11 @@ groups: See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: Postgresql invalid index description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`" - query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' severity: warning for: 6h + comments: | + See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 - name: SQL Server exporters: From 02687db33d657d387045a6a3e43fae793e6f2dfd Mon Sep 17 00:00:00 2001 From: samber Date: Tue, 20 Aug 2024 16:32:36 +0000 Subject: [PATCH 05/13] Publish --- dist/rules/postgresql/postgres-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 0e1f473..2ab461f 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -185,7 +185,7 @@ groups: description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlInvalidIndex - expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' for: 6h labels: severity: warning From 8c0bdc2b24e9ba6b07e659e8260020788755b4c3 Mon Sep 17 00:00:00 2001 From: Somrat Dutta <38795369+somratdutta@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:07:03 +0530 Subject: [PATCH 06/13] feat: Add NATS and JetStream Prometheus alert rules (#430) * feat: Add comprehensive NATS and JetStream Prometheus alert rules - Added multiple Prometheus alert rules for monitoring NATS server and JetStream metrics. - Included alerts for: - High connection count - High pending bytes - High subscriptions count - High routes count - High memory usage - Slow consumers - NATS server downtime - High CPU usage - High number of active connections - High JetStream store and memory usage - Subscription limits exceeded - High pending messages - Authentication timeouts - Errors in NATS (JetStream API errors) - JetStream consumers limit exceeded - Exceeding max payload size - Leaf node connection issues - Ping operations limit exceeded - Write deadline exceeded - Ensured consistency between `exporter.yml` and `rules.yml` files. - Improved overall NATS and JetStream monitoring to prevent performance degradation and ensure system reliability. This commit enhances the visibility of NATS and JetStream operations by providing key metrics to alert on potential issues and optimize system performance. * Update rules.yml * - minor changes, rollback rules.yml - address comment changes - revert to old rules.yml as they are generated * - minor changes, rollback rules.yml - address comment changes - revert to old rules.yml as they are generated * fix indentation --------- Co-authored-by: somratdutta Co-authored-by: Samuel Berthe Co-authored-by: somrat.dutta --- _data/rules.yml | 83 ++++++++++++++++++++++++++++++- dist/rules/nats/nats-exporter.yml | 2 +- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index b9506d2..6f5d04d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1534,9 +1534,90 @@ groups: for: 3m - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} - query: "gnatsd_routez_num_routes > 10" + query: "gnatsd_varz_routes > 10" severity: warning for: 3m + - name: Nats high memory usage + description: NATS server memory usage is above 200MB for {{ $labels.instance }} + query: "gnatsd_varz_mem > 200 * 1024 * 1024" + severity: warning + for: 5m + - name: Nats slow consumers + description: There are slow consumers in NATS for {{ $labels.instance }} + query: "gnatsd_varz_slow_consumers > 0" + severity: critical + for: 3m + - name: Nats server down + description: NATS server has been down for more than 5 minutes + query: "absent(up{job="nats"})" + severity: critical + for: 5m + - name: Nats high CPU usage + description: NATS server is using more than 80% CPU for the last 5 minutes + query: "rate(gnatsd_varz_cpu[5m]) > 0.8" + severity: warning + for: 5m + - name: Nats high number of connections + description: NATS server has more than 1000 active connections + query: "gnatsd_connz_num_connections > 1000" + severity: warning + for: 5m + - name: Nats high JetStream store usage + description: JetStream store usage is over 80% + query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8" + severity: warning + for: 5m + - name: Nats high JetStream memory usage + description: JetStream memory usage is over 80% + query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8" + severity: warning + for: 5m + - name: Nats high number of subscriptions + description: NATS server has more than 1000 active subscriptions + query: "gnatsd_connz_subscriptions > 1000" + severity: warning + for: 5m + - name: Nats high pending bytes + description: NATS server has more than 100,000 pending bytes + query: "gnatsd_connz_pending_bytes > 100000" + severity: warning + for: 5m + - name: Nats too many errors + description: NATS server has encountered errors in the last 5 minutes + query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" + severity: warning + for: 5m + - name: Nats JetStream consumers exceeded + description: JetStream has more than 100 active consumers + query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" + severity: warning + for: 5m + - name: Nats frequent authentication timeouts + description: There have been more than 5 authentication timeouts in the last 5 minutes + query: "increase(gnatsd_varz_auth_timeout[5m]) > 5" + severity: warning + for: 5m + - name: Nats max payload size exceeded + description: The max payload size allowed by NATS has been exceeded (1MB) + query: "max(gnatsd_varz_max_payload) > 1024 * 1024" + severity: critical + for: 5m + - name: Nats leaf node connection issue + description: No leaf node connections have been established in the last 5 minutes + query: "increase(gnatsd_varz_leafnodes[5m]) == 0" + severity: critical + for: 5m + - name: Nats max ping operations exceeded + description: The maximum number of ping operations in NATS has exceeded 50 + query: "gnatsd_varz_ping_max > 50" + severity: warning + for: 5m + - name: Nats write deadline exceeded + description: The write deadline has been exceeded in NATS, indicating potential message delivery issues + query: "gnatsd_varz_write_deadline > 10" + severity: critical + for: 5m + - name: Solr exporters: diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index 13eda2b..a9a74fa 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -38,4 +38,4 @@ groups: severity: warning annotations: summary: Nats high routes count (instance {{ $labels.instance }}) - description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file From 3bf8d6d824e5d61221ce4e380d5979532f28f68a Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sat, 24 Aug 2024 10:42:21 +0200 Subject: [PATCH 07/13] fix: Gemfile to reduce vulnerabilities (#432) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-RUBY-REXML-7814166 Co-authored-by: snyk-bot --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index eef87b6..c958185 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' -gem 'github-pages', '>= 227', group: :jekyll_plugins +gem 'github-pages', '>= 232', group: :jekyll_plugins gem 'webrick', '~> 1.3', '>= 1.3.1' \ No newline at end of file From 995ab4d27a5cdaa8045f1103f15a7161c4b245b3 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 28 Aug 2024 08:46:41 +0200 Subject: [PATCH 08/13] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 6f5d04d..f5ac4bf 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -267,7 +267,7 @@ groups: for: 5m - name: Host node overtemperature alarm description: "Physical node temperature alarm triggered" - query: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + query: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' severity: critical - name: Host RAID array got inactive description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically." From f08e8df514173515f14a09b2f8a805010316c4f0 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Wed, 28 Aug 2024 08:48:42 +0200 Subject: [PATCH 09/13] oops --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index f5ac4bf..9b94c17 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1549,7 +1549,7 @@ groups: for: 3m - name: Nats server down description: NATS server has been down for more than 5 minutes - query: "absent(up{job="nats"})" + query: 'absent(up{job="nats"})' severity: critical for: 5m - name: Nats high CPU usage From 4aa45dee059a05bb5e8268b506197699296221aa Mon Sep 17 00:00:00 2001 From: samber Date: Wed, 28 Aug 2024 06:49:52 +0000 Subject: [PATCH 10/13] Publish --- .../rules/host-and-hardware/node-exporter.yml | 2 +- dist/rules/nats/nats-exporter.yml | 148 +++++++++++++++++- 2 files changed, 147 insertions(+), 3 deletions(-) diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 0d80c16..6a465d9 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -215,7 +215,7 @@ groups: description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm - expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index a9a74fa..7648762 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -32,10 +32,154 @@ groups: description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NatsHighRoutesCount - expr: 'gnatsd_routez_num_routes > 10' + expr: 'gnatsd_varz_routes > 10' for: 3m labels: severity: warning annotations: summary: Nats high routes count (instance {{ $labels.instance }}) - description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file + description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighMemoryUsage + expr: 'gnatsd_varz_mem > 200 * 1024 * 1024' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high memory usage (instance {{ $labels.instance }}) + description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsSlowConsumers + expr: 'gnatsd_varz_slow_consumers > 0' + for: 3m + labels: + severity: critical + annotations: + summary: Nats slow consumers (instance {{ $labels.instance }}) + description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsServerDown + expr: 'absent(up{job="nats"})' + for: 5m + labels: + severity: critical + annotations: + summary: Nats server down (instance {{ $labels.instance }}) + description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighCpuUsage + expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high CPU usage (instance {{ $labels.instance }}) + description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighNumberOfConnections + expr: 'gnatsd_connz_num_connections > 1000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high number of connections (instance {{ $labels.instance }}) + description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighJetstreamStoreUsage + expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high JetStream store usage (instance {{ $labels.instance }}) + description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighJetstreamMemoryUsage + expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high JetStream memory usage (instance {{ $labels.instance }}) + description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighNumberOfSubscriptions + expr: 'gnatsd_connz_subscriptions > 1000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high number of subscriptions (instance {{ $labels.instance }}) + description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsHighPendingBytes + expr: 'gnatsd_connz_pending_bytes > 100000' + for: 5m + labels: + severity: warning + annotations: + summary: Nats high pending bytes (instance {{ $labels.instance }}) + description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsTooManyErrors + expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0' + for: 5m + labels: + severity: warning + annotations: + summary: Nats too many errors (instance {{ $labels.instance }}) + description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsJetstreamConsumersExceeded + expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100' + for: 5m + labels: + severity: warning + annotations: + summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }}) + description: "JetStream has more than 100 active consumers\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsFrequentAuthenticationTimeouts + expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5' + for: 5m + labels: + severity: warning + annotations: + summary: Nats frequent authentication timeouts (instance {{ $labels.instance }}) + description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsMaxPayloadSizeExceeded + expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024' + for: 5m + labels: + severity: critical + annotations: + summary: Nats max payload size exceeded (instance {{ $labels.instance }}) + description: "The max payload size allowed by NATS has been exceeded (1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsLeafNodeConnectionIssue + expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0' + for: 5m + labels: + severity: critical + annotations: + summary: Nats leaf node connection issue (instance {{ $labels.instance }}) + description: "No leaf node connections have been established in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsMaxPingOperationsExceeded + expr: 'gnatsd_varz_ping_max > 50' + for: 5m + labels: + severity: warning + annotations: + summary: Nats max ping operations exceeded (instance {{ $labels.instance }}) + description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: NatsWriteDeadlineExceeded + expr: 'gnatsd_varz_write_deadline > 10' + for: 5m + labels: + severity: critical + annotations: + summary: Nats write deadline exceeded (instance {{ $labels.instance }}) + description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 65a5f586cb8f72c44310acf357f1eaae476cd591 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 9 Sep 2024 20:09:20 +0200 Subject: [PATCH 11/13] build(deps-dev): bump rexml from 3.3.3 to 3.3.6 (#431) Bumps [rexml](https://github.com/ruby/rexml) from 3.3.3 to 3.3.6. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.3.3...v3.3.6) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index f2d1111..71b8bda 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -237,7 +237,7 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.3) + rexml (3.3.6) strscan rouge (3.30.0) ruby2_keywords (0.0.5) From d6d6ae4ef843bfc63d021008772a0d55c02beb51 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 26 Sep 2024 11:31:21 +0200 Subject: [PATCH 12/13] fix: Gemfile to reduce vulnerabilities (#434) The following vulnerabilities are fixed with an upgrade: - https://snyk.io/vuln/SNYK-RUBY-WEBRICK-8068535 Co-authored-by: snyk-bot --- Gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile b/Gemfile index c958185..1ff80aa 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' gem 'github-pages', '>= 232', group: :jekyll_plugins -gem 'webrick', '~> 1.3', '>= 1.3.1' \ No newline at end of file +gem 'webrick', '~> 1.8', '>= 1.8.2' \ No newline at end of file From 35596c866f129e3134f7ac705e90f50002dae073 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 27 Sep 2024 22:24:21 +0200 Subject: [PATCH 13/13] build(deps): bump webrick from 1.7.0 to 1.8.2 (#435) Bumps [webrick](https://github.com/ruby/webrick) from 1.7.0 to 1.8.2. - [Release notes](https://github.com/ruby/webrick/releases) - [Commits](https://github.com/ruby/webrick/compare/v1.7.0...v1.8.2) --- updated-dependencies: - dependency-name: webrick dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile | 2 +- Gemfile.lock | 72 ++++++++++++++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/Gemfile b/Gemfile index 1ff80aa..cddfa60 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' gem 'github-pages', '>= 232', group: :jekyll_plugins -gem 'webrick', '~> 1.8', '>= 1.8.2' \ No newline at end of file +gem 'webrick', '~> 1.8' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 71b8bda..1afa3ed 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,16 +1,17 @@ GEM remote: https://rubygems.org/ specs: - activesupport (7.1.3.4) + activesupport (7.2.1) base64 bigdecimal - concurrent-ruby (~> 1.0, >= 1.0.2) + concurrent-ruby (~> 1.0, >= 1.3.1) connection_pool (>= 2.2.5) drb i18n (>= 1.6, < 2) + logger (>= 1.4.2) minitest (>= 5.1) - mutex_m - tzinfo (~> 2.0) + securerandom (>= 0.3) + tzinfo (~> 2.0, >= 2.0.5) addressable (2.8.7) public_suffix (>= 2.0.2, < 7.0) base64 (0.2.0) @@ -21,8 +22,9 @@ GEM coffee-script-source (1.12.2) colorator (1.1.0) commonmarker (0.23.10) - concurrent-ruby (1.3.3) + concurrent-ruby (1.3.4) connection_pool (2.4.1) + csv (3.3.0) dnsruby (1.72.2) simpleidn (~> 0.2.1) drb (2.2.1) @@ -33,20 +35,23 @@ GEM ffi (>= 1.15.0) eventmachine (1.2.7) execjs (2.9.1) - faraday (2.8.1) - base64 - faraday-net_http (>= 2.0, < 3.1) - ruby2_keywords (>= 0.0.4) - faraday-net_http (3.0.2) - ffi (1.16.3) + faraday (2.12.0) + faraday-net_http (>= 2.0, < 3.4) + json + logger + faraday-net_http (3.3.0) + net-http + ffi (1.17.0) + ffi (1.17.0-x86_64-linux-gnu) + ffi (1.17.0-x86_64-linux-musl) forwardable-extended (2.6.0) gemoji (4.1.0) - github-pages (231) + github-pages (232) github-pages-health-check (= 1.18.2) - jekyll (= 3.9.5) + jekyll (= 3.10.0) jekyll-avatar (= 0.8.0) jekyll-coffeescript (= 1.2.2) - jekyll-commonmark-ghpages (= 0.4.0) + jekyll-commonmark-ghpages (= 0.5.1) jekyll-default-layout (= 0.1.5) jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) @@ -83,9 +88,10 @@ GEM liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.6, < 2.0) + nokogiri (>= 1.16.2, < 2.0) rouge (= 3.30.0) terminal-table (~> 1.4) + webrick (~> 1.8) github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) @@ -96,11 +102,12 @@ GEM activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.8.0) - i18n (1.14.5) + i18n (1.14.6) concurrent-ruby (~> 1.0) - jekyll (3.9.5) + jekyll (3.10.0) addressable (~> 2.4) colorator (~> 1.0) + csv (~> 3.0) em-websocket (~> 0.5) i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) @@ -111,6 +118,7 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) + webrick (>= 1.0) jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.2.2) @@ -118,9 +126,9 @@ GEM coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.4.0) - commonmarker (~> 0.23.7) - jekyll (~> 3.9.0) + jekyll-commonmark-ghpages (0.5.1) + commonmarker (>= 0.23.7, < 1.1.0) + jekyll (>= 3.9, < 4.0) jekyll-commonmark (~> 1.4.0) rouge (>= 2.0, < 5.0) jekyll-default-layout (0.1.5) @@ -206,6 +214,7 @@ GEM gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) + json (2.7.2) kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) @@ -214,18 +223,20 @@ GEM listen (3.9.0) rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) + logger (1.6.1) mercenary (0.3.6) mini_portile2 (2.8.7) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.24.1) - mutex_m (0.2.0) - nokogiri (1.16.5) + minitest (5.25.1) + net-http (0.4.1) + uri + nokogiri (1.16.7) mini_portile2 (~> 2.8.2) racc (~> 1.4) - nokogiri (1.16.5-x86_64-linux) + nokogiri (1.16.7-x86_64-linux) racc (~> 1.4) octokit (4.25.1) faraday (>= 1, < 3) @@ -237,10 +248,8 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.6) - strscan + rexml (3.3.7) rouge (3.30.0) - ruby2_keywords (0.0.5) rubyzip (2.3.2) safe_yaml (1.0.5) sass (3.7.4) @@ -251,8 +260,8 @@ GEM sawyer (0.9.2) addressable (>= 2.3.5) faraday (>= 0.17.3, < 3) + securerandom (0.3.1) simpleidn (0.2.3) - strscan (3.1.0) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) typhoeus (1.4.1) @@ -260,7 +269,8 @@ GEM tzinfo (2.0.6) concurrent-ruby (~> 1.0) unicode-display_width (1.8.0) - webrick (1.7.0) + uri (0.13.1) + webrick (1.8.2) PLATFORMS ruby @@ -268,8 +278,8 @@ PLATFORMS x86_64-linux-musl DEPENDENCIES - github-pages (>= 227) - webrick (~> 1.3, >= 1.3.1) + github-pages (>= 232) + webrick (~> 1.8) BUNDLED WITH 2.3.13