diff --git a/_data/rules.yml b/_data/rules.yml
index 0216beb..6d4ba8b 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1532,9 +1532,90 @@ groups:
                 for: 3m
               - name: Nats high routes count
                 description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
-                query: "gnatsd_routez_num_routes > 10"
+                query: "gnatsd_varz_routes > 10"
                 severity: warning
                 for: 3m
+              - name: Nats high memory usage
+                description: NATS server memory usage is above 200MB for {{ $labels.instance }}
+                query: "gnatsd_varz_mem > 200000000"
+                severity: warning
+                for: 5m
+              - name: Nats slow consumers
+                description: There are slow consumers in NATS for {{ $labels.instance }}
+                query: "gnatsd_varz_slow_consumers > 0"
+                severity: critical
+                for: 3m
+              - name: Nats server down
+                description: NATS server has been down for more than 5 minutes
+                query: "absent(gnatsd_connz_total)"
+                severity: critical
+                for: 5m
+              - name: High CPU usage on NATS server
+                description: NATS server is using more than 80% CPU for the last 5 minutes
+                query: "gnatsd_varz_cpu > 80"  # compared directly: rate() only applies to counters; assumes this gauge is in percent, confirm
+                severity: warning
+                for: 5m
+              - name: High number of connections in NATS
+                description: NATS server has more than 1000 active connections
+                query: "gnatsd_connz_num_connections > 1000"
+                severity: warning
+                for: 5m
+              - name: High JetStream store usage
+                description: JetStream store usage is over 80%
+                query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
+                severity: warning
+                for: 5m
+              - name: High JetStream memory usage
+                description: JetStream memory usage is over 80%
+                query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
+                severity: warning
+                for: 5m
+              - name: Nats high number of subscriptions
+                description: NATS server has more than 1000 active subscriptions
+                query: "gnatsd_connz_subscriptions > 1000"
+                severity: warning
+                for: 5m
+              - name: High pending bytes in NATS
+                description: NATS server has more than 100,000 pending bytes
+                query: "gnatsd_connz_pending_bytes > 100000"  # the metric counts bytes, not messages
+                severity: warning
+                for: 5m
+              - name: JetStream API errors in NATS
+                description: NATS JetStream API has returned errors in the last 5 minutes
+                query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
+                severity: warning
+                for: 5m
+              - name: JetStream accounts exceeded
+                description: JetStream has more than 100 accounts
+                query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"  # the metric counts accounts, not consumers
+                severity: warning
+                for: 5m
+              - name: Frequent authentication timeouts in NATS
+                description: There have been more than 5 authentication timeouts in the last 5 minutes
+                query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"  # NOTE(review): metric name suggests the configured auth timeout, not a timeout counter; confirm
+                severity: warning
+                for: 5m
+              - name: Max payload size exceeded in NATS
+                description: The max payload size allowed by NATS has been exceeded
+                query: "max(gnatsd_varz_max_payload) > 1000000"  # NOTE(review): looks like the configured max_payload limit, not an observed payload size; confirm
+                severity: critical
+                for: 5m
+              - name: Leaf node connection issue in NATS
+                description: NATS server has had no leaf node connections for the last 5 minutes
+                query: "gnatsd_varz_leafnodes == 0"  # compared directly: increase() only applies to counters; assumes a gauge of current leaf node connections, confirm
+                severity: critical
+                for: 5m
+              - name: Max ping operations exceeded in NATS
+                description: The maximum number of ping operations in NATS has exceeded 50
+                query: "gnatsd_varz_ping_max > 50"  # NOTE(review): looks like the configured ping_max setting, not a runtime count; confirm
+                severity: warning
+                for: 5m
+              - name: Write deadline exceeded in NATS
+                description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
+                query: "gnatsd_varz_write_deadline > 10"  # NOTE(review): looks like the configured write_deadline (unit unverified), not a count of missed deadlines; confirm
+                severity: critical
+                for: 5m
+
 
       - name: Solr
         exporters:
diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml
index 13eda2b..05a2413 100644
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@@ -32,10 +32,154 @@ groups:
         description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
     - alert: NatsHighRoutesCount
-      expr: 'gnatsd_routez_num_routes > 10'
+      expr: 'gnatsd_varz_routes > 10'
       for: 3m
       labels:
         severity: warning
       annotations:
         summary: Nats high routes count (instance {{ $labels.instance }})
         description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: NatsHighMemoryUsage
+      expr: 'gnatsd_varz_mem > 200000000'  # Adjust based on NATS instance sizing
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: NATS high memory usage (instance {{ $labels.instance }})
+        description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: NatsSlowConsumers
+      expr: 'gnatsd_varz_slow_consumers > 0'
+      for: 3m
+      labels:
+        severity: critical
+      annotations:
+        summary: Slow consumers in NATS (instance {{ $labels.instance }})
+        description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: NatsServerDown
+      expr: 'absent(gnatsd_connz_total)'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "NATS server is down"
+        description: "NATS server has been down for more than 5 minutes."
+
+    - alert: HighNatsCpuUsage
+      expr: 'gnatsd_varz_cpu > 80'  # compared directly: rate() only applies to counters; assumes this gauge is in percent, confirm
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High CPU usage on NATS server"
+        description: "NATS server is using more than 80% CPU for the last 5 minutes."
+
+    - alert: HighNatsConnections
+      expr: 'gnatsd_connz_num_connections > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High number of connections in NATS"
+        description: "NATS server has more than 1000 active connections."
+
+    - alert: HighJetStreamStoreUsage
+      expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High JetStream store usage"
+        description: "JetStream store usage is over 80%."
+
+    - alert: HighJetStreamMemoryUsage
+      expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High JetStream memory usage"
+        description: "JetStream memory usage is over 80%."
+
+    - alert: NatsSubscriptionsExceeded
+      expr: 'gnatsd_connz_subscriptions > 1000'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High number of subscriptions in NATS"
+        description: "NATS server has more than 1000 active subscriptions."
+
+    - alert: NatsMessagesPending
+      expr: 'gnatsd_connz_pending_bytes > 100000'  # the metric counts bytes, not messages
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High number of pending bytes in NATS"
+        description: "NATS server has more than 100,000 pending bytes."
+
+    - alert: NatsErrors
+      expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "JetStream API errors in NATS"
+        description: "NATS JetStream API has returned errors in the last 5 minutes."
+
+    - alert: JetStreamConsumersExceeded  # identifier kept for compatibility; the expression counts accounts
+      expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "High number of JetStream accounts"
+        description: "JetStream has more than 100 accounts."
+
+    - alert: NatsAuthTimeouts
+      expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'  # NOTE(review): metric name suggests the configured auth timeout, not a timeout counter; confirm
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Frequent authentication timeouts on NATS"
+        description: "There have been more than 5 authentication timeouts in the last 5 minutes."
+
+    - alert: NatsMaxPayloadExceeded
+      expr: 'max(gnatsd_varz_max_payload) > 1000000'  # NOTE(review): looks like the configured max_payload limit, not an observed payload size; confirm
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Max payload size exceeded in NATS"
+        description: "The max payload size allowed by NATS has been exceeded."
+
+    - alert: NatsLeafNodeIssues
+      expr: 'gnatsd_varz_leafnodes == 0'  # compared directly: increase() only applies to counters; assumes a gauge of current leaf node connections, confirm
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Leaf node connection issue in NATS"
+        description: "NATS server has had no leaf node connections for the last 5 minutes."
+
+    - alert: NatsPingMaxExceeded
+      expr: 'gnatsd_varz_ping_max > 50'  # NOTE(review): looks like the configured ping_max setting, not a runtime count; confirm
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Max ping operations exceeded in NATS"
+        description: "The maximum number of ping operations in NATS has exceeded 50."
+
+    - alert: NatsWriteDeadlineExceeded
+      expr: 'gnatsd_varz_write_deadline > 10'  # NOTE(review): looks like the configured write_deadline (unit unverified), not a count of missed deadlines; confirm
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Write deadline exceeded in NATS"
+        description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues."