From 8c0bdc2b24e9ba6b07e659e8260020788755b4c3 Mon Sep 17 00:00:00 2001 From: Somrat Dutta <38795369+somratdutta@users.noreply.github.com> Date: Wed, 21 Aug 2024 00:07:03 +0530 Subject: [PATCH] feat: Add NATS and JetStream Prometheus alert rules (#430) * feat: Add comprehensive NATS and JetStream Prometheus alert rules - Added multiple Prometheus alert rules for monitoring NATS server and JetStream metrics. - Included alerts for: - High connection count - High pending bytes - High subscriptions count - High routes count - High memory usage - Slow consumers - NATS server downtime - High CPU usage - High number of active connections - High JetStream store and memory usage - Subscription limits exceeded - High pending messages - Authentication timeouts - Errors in NATS (JetStream API errors) - JetStream consumers limit exceeded - Exceeding max payload size - Leaf node connection issues - Ping operations limit exceeded - Write deadline exceeded - Ensured consistency between `exporter.yml` and `rules.yml` files. - Improved overall NATS and JetStream monitoring to prevent performance degradation and ensure system reliability. This commit enhances the visibility of NATS and JetStream operations by providing key metrics to alert on potential issues and optimize system performance. 
* Update rules.yml * - minor changes, rollback rules.yml - address comment changes - revert to old rules.yml as they are generated * - minor changes, rollback rules.yml - address comment changes - revert to old rules.yml as they are generated * fix indentation --------- Co-authored-by: somratdutta Co-authored-by: Samuel Berthe Co-authored-by: somrat.dutta --- _data/rules.yml | 83 ++++++++++++++++++++++++++++++- dist/rules/nats/nats-exporter.yml | 2 +- 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index b9506d2..6f5d04d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1534,9 +1534,90 @@ groups: for: 3m - name: Nats high routes count description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }} - query: "gnatsd_routez_num_routes > 10" + query: "gnatsd_varz_routes > 10" severity: warning for: 3m + - name: Nats high memory usage + description: NATS server memory usage is above 200MB for {{ $labels.instance }} + query: "gnatsd_varz_mem > 200 * 1024 * 1024" + severity: warning + for: 5m + - name: Nats slow consumers + description: There are slow consumers in NATS for {{ $labels.instance }} + query: "gnatsd_varz_slow_consumers > 0" + severity: critical + for: 3m + - name: Nats server down + description: NATS server has been down for more than 5 minutes + query: 'absent(up{job="nats"})' + severity: critical + for: 5m + - name: Nats high CPU usage + description: NATS server is using more than 80% CPU for the last 5 minutes + query: "rate(gnatsd_varz_cpu[5m]) > 0.8" + severity: warning + for: 5m + - name: Nats high number of connections + description: NATS server has more than 1000 active connections + query: "gnatsd_connz_num_connections > 1000" + severity: warning + for: 5m + - name: Nats high JetStream store usage + description: JetStream store usage is over 80% + query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8" + severity: warning + for: 5m + - name: 
Nats high JetStream memory usage + description: JetStream memory usage is over 80% + query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8" + severity: warning + for: 5m + - name: Nats high number of subscriptions + description: NATS server has more than 1000 active subscriptions + query: "gnatsd_connz_subscriptions > 1000" + severity: warning + for: 5m + - name: Nats high pending bytes + description: NATS server has more than 100,000 pending bytes + query: "gnatsd_connz_pending_bytes > 100000" + severity: warning + for: 5m + - name: Nats too many errors + description: NATS server has encountered errors in the last 5 minutes + query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" + severity: warning + for: 5m + - name: Nats JetStream consumers exceeded + description: JetStream has more than 100 active consumers + query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" + severity: warning + for: 5m + - name: Nats frequent authentication timeouts + description: There have been more than 5 authentication timeouts in the last 5 minutes + query: "increase(gnatsd_varz_auth_timeout[5m]) > 5" + severity: warning + for: 5m + - name: Nats max payload size exceeded + description: The max payload size allowed by NATS has been exceeded (1MB) + query: "max(gnatsd_varz_max_payload) > 1024 * 1024" + severity: critical + for: 5m + - name: Nats leaf node connection issue + description: No leaf node connections have been established in the last 5 minutes + query: "increase(gnatsd_varz_leafnodes[5m]) == 0" + severity: critical + for: 5m + - name: Nats max ping operations exceeded + description: The maximum number of ping operations in NATS has exceeded 50 + query: "gnatsd_varz_ping_max > 50" + severity: warning + for: 5m + - name: Nats write deadline exceeded + description: The write deadline has been exceeded in NATS, indicating potential message delivery issues + query: "gnatsd_varz_write_deadline > 10" + severity: critical + for: 5m 
+ - name: Solr exporters: diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index 13eda2b..a9a74fa 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -38,4 +38,4 @@ groups: severity: warning annotations: summary: Nats high routes count (instance {{ $labels.instance }}) - description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file