# Prometheus alerting rules for a NATS server, built on the `gnatsd_varz_*`
# and `gnatsd_connz_*` metric families.
#
# All thresholds are generic starting points; tune them to the deployment.
# Alert names are kept stable so existing Alertmanager routes and silences
# keep matching.
groups:
  - name: NatsExporter
    rules:
      # ------------------------------------------------------------------
      # Per-instance capacity alerts (annotations carry instance + labels).
      # ------------------------------------------------------------------
      - alert: NatsHighConnectionCount
        expr: 'gnatsd_varz_connections > 100'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high connection count (instance {{ $labels.instance }})"
          description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NatsHighPendingBytes
        expr: 'gnatsd_connz_pending_bytes > 100000'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high pending bytes (instance {{ $labels.instance }})"
          description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NatsHighSubscriptionsCount
        expr: 'gnatsd_connz_subscriptions > 50'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high subscriptions count (instance {{ $labels.instance }})"
          description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NatsHighRoutesCount
        expr: 'gnatsd_varz_routes > 10'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high routes count (instance {{ $labels.instance }})"
          description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NatsHighMemoryUsage
        # Adjust based on NATS instance sizing
        expr: 'gnatsd_varz_mem > 200000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "NATS high memory usage (instance {{ $labels.instance }})"
          description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: NatsSlowConsumers
        expr: 'gnatsd_varz_slow_consumers > 0'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "Slow consumers in NATS (instance {{ $labels.instance }})"
          description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ------------------------------------------------------------------
      # Availability and load.
      # ------------------------------------------------------------------
      # absent() yields a value only when no series of the metric exists at
      # all, so this fires when nothing is reporting gnatsd_connz_total.
      - alert: NatsServerDown
        expr: 'absent(gnatsd_connz_total)'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "NATS server is down"
          description: "NATS server has been down for more than 5 minutes."

      # gnatsd_varz_cpu is a gauge holding the current CPU percentage, so it
      # is compared directly: rate() is only meaningful on counters. The
      # `for: 5m` clause provides the "for the last 5 minutes" condition.
      - alert: HighNatsCpuUsage
        expr: 'gnatsd_varz_cpu > 80'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on NATS server"
          description: "NATS server is using more than 80% CPU for the last 5 minutes."

      # Higher-threshold companion to NatsHighConnectionCount, based on the
      # connz connection count instead of the varz one.
      - alert: HighNatsConnections
        expr: 'gnatsd_connz_num_connections > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of connections in NATS"
          description: "NATS server has more than 1000 active connections."

      # ------------------------------------------------------------------
      # JetStream resource usage (used / configured maximum).
      # ------------------------------------------------------------------
      - alert: HighJetStreamStoreUsage
        expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High JetStream store usage"
          description: "JetStream store usage is over 80%."

      - alert: HighJetStreamMemoryUsage
        expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High JetStream memory usage"
          description: "JetStream memory usage is over 80%."

      # Higher-threshold companion to NatsHighSubscriptionsCount (same metric).
      - alert: NatsSubscriptionsExceeded
        expr: 'gnatsd_connz_subscriptions > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of subscriptions in NATS"
          description: "NATS server has more than 1000 active subscriptions."

      # Same expression as NatsHighPendingBytes with a longer `for`. The
      # metric is a byte count, so the annotations say bytes, not messages.
      - alert: NatsMessagesPending
        expr: 'gnatsd_connz_pending_bytes > 100000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of pending bytes in NATS"
          description: "NATS server has more than 100,000 pending bytes."

      # Fires when the JetStream API error count grew during the window.
      - alert: NatsErrors
        expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Errors in NATS"
          description: "NATS server has encountered errors in the last 5 minutes."

      # The metric counts JetStream accounts, not consumers; the annotations
      # describe what is actually measured.
      # NOTE(review): if a consumer-count alert is what is wanted, point the
      # expression at a consumer metric instead - confirm which one the
      # exporter in use exposes.
      - alert: JetStreamConsumersExceeded
        expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of JetStream accounts"
          description: "JetStream has more than 100 accounts."

      # ------------------------------------------------------------------
      # Configuration-derived metrics.
      # ------------------------------------------------------------------
      # NOTE(review): gnatsd_varz_auth_timeout looks like the configured
      # authentication timeout rather than a counter of timeout events. If
      # so, increase() stays at 0 and this alert can never fire - confirm
      # and replace with an event-based metric if one is available.
      - alert: NatsAuthTimeouts
        expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Frequent authentication timeouts on NATS"
          description: "There have been more than 5 authentication timeouts in the last 5 minutes."

      # gnatsd_varz_max_payload is the configured limit in bytes. The NATS
      # default is 1 MiB (1048576), which is already above 1000000, so the
      # threshold is the default itself: only a raised limit fires.
      - alert: NatsMaxPayloadExceeded
        expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Max payload size exceeded in NATS"
          description: "The configured max payload size in NATS is above the 1 MiB default."

      # gnatsd_varz_leafnodes is a gauge of currently connected leaf nodes.
      # increase(...) == 0 on a gauge is true whenever the count is stable
      # (i.e. in normal operation), so the gauge is tested directly.
      # NOTE(review): this fires on servers that are not meant to have leaf
      # nodes; scope it with a label matcher if that applies.
      - alert: NatsLeafNodeIssues
        expr: 'gnatsd_varz_leafnodes == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Leaf node connection issue in NATS"
          description: "No leaf node connections have been present for the last 5 minutes."

      # NOTE(review): gnatsd_varz_ping_max looks like the configured maximum
      # of outstanding pings, not a runtime count - confirm the intent.
      - alert: NatsPingMaxExceeded
        expr: 'gnatsd_varz_ping_max > 50'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Max ping operations exceeded in NATS"
          description: "The maximum number of ping operations in NATS has exceeded 50."

      # The server reports write_deadline in nanoseconds, so a threshold of
      # 10 is always exceeded. 10e9 ns = 10 s.
      # NOTE(review): this is the configured deadline, not a count of
      # missed deadlines - confirm 10 s is the intended ceiling.
      - alert: NatsWriteDeadlineExceeded
        expr: 'gnatsd_varz_write_deadline > 10e9'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Write deadline exceeded in NATS"
          description: "The configured write deadline in NATS is above 10 seconds."