diff --git a/_data/rules.yml b/_data/rules.yml index 6d4ba8b..fb325c7 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1537,7 +1537,7 @@ groups: for: 3m - name: Nats high memory usage description: NATS server memory usage is above 200MB for {{ $labels.instance }} - query: "gnatsd_varz_mem > 200000000" + query: "gnatsd_varz_mem > 200 * 1024 * 1024" severity: warning for: 5m - name: Nats slow consumers @@ -1547,7 +1547,7 @@ groups: for: 3m - name: Nats server down description: NATS server has been down for more than 5 minutes - query: "absent(gnatsd_connz_total)" + query: "absent(up{job='nats'})" severity: critical for: 5m - name: High CPU usage on NATS server @@ -1576,7 +1576,7 @@ groups: severity: warning for: 5m - name: High pending messages in NATS - description: NATS server has more than 100,000 pending messages + description: NATS server has more than 100,000 pending bytes query: "gnatsd_connz_pending_bytes > 100000" severity: warning for: 5m @@ -1596,8 +1596,8 @@ groups: severity: warning for: 5m - name: Max payload size exceeded in NATS - description: The max payload size allowed by NATS has been exceeded - query: "max(gnatsd_varz_max_payload) > 1000000" + description: The max payload size allowed by NATS has been exceeded (1MB) + query: "max(gnatsd_varz_max_payload) > 1024 * 1024" severity: critical for: 5m - name: Leaf node connection issue in NATS diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml index 05a2413..faaf7c3 100644 --- a/dist/rules/nats/nats-exporter.yml +++ b/dist/rules/nats/nats-exporter.yml @@ -1,185 +1,41 @@ groups: -- name: NatsExporter + - name: NatsExporter - rules: + rules: - - alert: NatsHighConnectionCount - expr: 'gnatsd_varz_connections > 100' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high connection count (instance {{ $labels.instance }}) - description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: NatsHighConnectionCount + expr: 'gnatsd_varz_connections > 100' + for: 3m + labels: + severity: warning + annotations: + summary: Nats high connection count (instance {{ $labels.instance }}) + description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: NatsHighPendingBytes - expr: 'gnatsd_connz_pending_bytes > 100000' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high pending bytes (instance {{ $labels.instance }}) - description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: NatsHighPendingBytes + expr: 'gnatsd_connz_pending_bytes > 100000' + for: 3m + labels: + severity: warning + annotations: + summary: Nats high pending bytes (instance {{ $labels.instance }}) + description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: NatsHighSubscriptionsCount - expr: 'gnatsd_connz_subscriptions > 50' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high subscriptions count (instance {{ $labels.instance }}) - description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: NatsHighSubscriptionsCount + expr: 'gnatsd_connz_subscriptions > 50' + for: 3m + labels: + severity: warning + annotations: + summary: Nats high subscriptions count (instance {{ $labels.instance }}) + description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: NatsHighRoutesCount - expr: 'gnatsd_varz_routes > 10' - for: 3m - labels: - severity: warning - annotations: - summary: Nats high routes count (instance {{ $labels.instance }}) - description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsHighMemoryUsage - expr: 'gnatsd_varz_mem > 200000000' # Adjust based on NATS instance sizing - for: 5m - labels: - severity: warning - annotations: - summary: NATS high memory usage (instance {{ $labels.instance }}) - description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsSlowConsumers - expr: 'gnatsd_varz_slow_consumers > 0' - for: 3m - labels: - severity: critical - annotations: - summary: Slow consumers in NATS (instance {{ $labels.instance }}) - description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: NatsServerDown - expr: absent(gnatsd_connz_total) - for: 5m - labels: - severity: critical - annotations: - summary: "NATS server is down" - description: "NATS server has been down for more than 5 minutes." - - - alert: HighNatsCpuUsage - expr: rate(gnatsd_varz_cpu[5m]) > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "High CPU usage on NATS server" - description: "NATS server is using more than 80% CPU for the last 5 minutes." - - - alert: HighNatsConnections - expr: gnatsd_connz_num_connections > 1000 - for: 5m - labels: - severity: warning - annotations: - summary: "High number of connections in NATS" - description: "NATS server has more than 1000 active connections." - - - alert: HighJetStreamStoreUsage - expr: gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "High JetStream store usage" - description: "JetStream store usage is over 80%." - - - alert: HighJetStreamMemoryUsage - expr: gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 - for: 5m - labels: - severity: warning - annotations: - summary: "High JetStream memory usage" - description: "JetStream memory usage is over 80%." - - - alert: NatsSubscriptionsExceeded - expr: gnatsd_connz_subscriptions > 1000 - for: 5m - labels: - severity: warning - annotations: - summary: "High number of subscriptions in NATS" - description: "NATS server has more than 1000 active subscriptions." - - - alert: NatsMessagesPending - expr: gnatsd_connz_pending_bytes > 100000 - for: 5m - labels: - severity: warning - annotations: - summary: "High number of pending messages in NATS" - description: "NATS server has more than 100,000 pending messages." - - - alert: NatsErrors - expr: increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0 - for: 5m - labels: - severity: warning - annotations: - summary: "Errors in NATS" - description: "NATS server has encountered errors in the last 5 minutes." - - - alert: JetStreamConsumersExceeded - expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100 - for: 5m - labels: - severity: warning - annotations: - summary: "High number of JetStream consumers" - description: "JetStream has more than 100 active consumers." - - - alert: NatsAuthTimeouts - expr: increase(gnatsd_varz_auth_timeout[5m]) > 5 - for: 5m - labels: - severity: warning - annotations: - summary: "Frequent authentication timeouts on NATS" - description: "There have been more than 5 authentication timeouts in the last 5 minutes." - - - alert: NatsMaxPayloadExceeded - expr: max(gnatsd_varz_max_payload) > 1000000 - for: 5m - labels: - severity: critical - annotations: - summary: "Max payload size exceeded in NATS" - description: "The max payload size allowed by NATS has been exceeded." - - - alert: NatsLeafNodeIssues - expr: increase(gnatsd_varz_leafnodes[5m]) == 0 - for: 5m - labels: - severity: critical - annotations: - summary: "Leaf node connection issue in NATS" - description: "No leaf node connections have been established in the last 5 minutes." - - - alert: NatsPingMaxExceeded - expr: gnatsd_varz_ping_max > 50 - for: 5m - labels: - severity: warning - annotations: - summary: "Max ping operations exceeded in NATS" - description: "The maximum number of ping operations in NATS has exceeded 50." - - - alert: NatsWriteDeadlineExceeded - expr: gnatsd_varz_write_deadline > 10 - for: 5m - labels: - severity: critical - annotations: - summary: "Write deadline exceeded in NATS" - description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues." + - alert: NatsHighRoutesCount + expr: 'gnatsd_routez_num_routes > 10' + for: 3m + labels: + severity: warning + annotations: + summary: Nats high routes count (instance {{ $labels.instance }}) + description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file