feat: Add comprehensive NATS and JetStream Prometheus alert rules

- Added multiple Prometheus alert rules for monitoring NATS server and JetStream metrics.
- Included alerts for:
  - High connection count
  - High pending bytes
  - High subscriptions count
  - High routes count
  - High memory usage
  - Slow consumers
  - NATS server downtime
  - High CPU usage
  - High number of active connections
  - High JetStream store and memory usage
  - Subscription limits exceeded
  - High pending messages
  - Authentication timeouts
  - Errors in NATS (JetStream API errors)
  - JetStream consumers limit exceeded
  - Exceeding max payload size
  - Leaf node connection issues
  - Ping operations limit exceeded
  - Write deadline exceeded
- Ensured consistency between `exporter.yml` and `rules.yml` files.
- Improved overall NATS and JetStream monitoring to prevent performance degradation and ensure system reliability.

This commit enhances the visibility of NATS and JetStream operations by providing key metrics to alert on potential issues and optimize system performance.
This commit is contained in:
somratdutta 2024-08-20 20:37:00 +05:30
parent 61da73d517
commit 1a9ba83feb
2 changed files with 227 additions and 2 deletions

View file

@ -1532,9 +1532,90 @@ groups:
for: 3m
# Warns when the NATS routes metric stays above 10 for 3 minutes.
# NOTE(review): two `query` keys appear below. This looks like a rendered diff in
# which the gnatsd_routez_num_routes line was removed and the gnatsd_varz_routes
# line was added — confirm, and keep only one `query` key in the real file.
- name: Nats high routes count
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
query: "gnatsd_routez_num_routes > 10"
query: "gnatsd_varz_routes > 10"
severity: warning
for: 3m
# Warns when gnatsd_varz_mem stays above 200000000 (the 200MB named in the
# description) for 5 minutes.
- name: Nats high memory usage
  description: NATS server memory usage is above 200MB for {{ $labels.instance }}
  query: "gnatsd_varz_mem > 200000000"
  severity: warning
  for: 5m
# Critical when gnatsd_varz_slow_consumers stays above zero for 3 minutes.
- name: Nats slow consumers
  description: There are slow consumers in NATS for {{ $labels.instance }}
  query: "gnatsd_varz_slow_consumers > 0"
  severity: critical
  for: 3m
# Critical when no gnatsd_connz_total series has existed for 5 minutes.
- name: Nats server down
  description: NATS server has been down for more than 5 minutes
  query: "absent(gnatsd_connz_total)"
  severity: critical
  for: 5m
# Warns when the NATS server's CPU usage stays above 80% for 5 minutes.
# gnatsd_varz_cpu is exported as a gauge that already holds the server's CPU
# percentage, so it is compared directly; rate() is only meaningful on counters
# and would report the slope of the gauge rather than the utilisation.
- name: High CPU usage on NATS server
  description: NATS server is using more than 80% CPU for the last 5 minutes
  query: "gnatsd_varz_cpu > 80"
  severity: warning
  for: 5m
# Warns when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
- name: High number of connections in NATS
  description: NATS server has more than 1000 active connections
  query: "gnatsd_connz_num_connections > 1000"
  severity: warning
  for: 5m
# Warns when the ratio of JetStream storage in use to the configured maximum
# storage stays above 0.8 for 5 minutes.
- name: High JetStream store usage
  description: JetStream store usage is over 80%
  query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
  severity: warning
  for: 5m
# Warns when the ratio of JetStream memory in use to the configured maximum
# memory stays above 0.8 for 5 minutes.
- name: High JetStream memory usage
  description: JetStream memory usage is over 80%
  query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
  severity: warning
  for: 5m
# Warns when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
- name: Nats high number of subscriptions
  description: NATS server has more than 1000 active subscriptions
  query: "gnatsd_connz_subscriptions > 1000"
  severity: warning
  for: 5m
# Warns when gnatsd_connz_pending_bytes stays above 100000 for 5 minutes.
# The metric counts pending bytes, not messages, so the description states the
# threshold in bytes to match what the query actually measures.
- name: High pending messages in NATS
  description: NATS server has more than 100,000 pending bytes
  query: "gnatsd_connz_pending_bytes > 100000"
  severity: warning
  for: 5m
# Warns when gnatsd_varz_jetstream_stats_api_errors has increased over the
# trailing 5-minute window, sustained for 5 minutes.
- name: Errors in NATS
  description: NATS server has encountered errors in the last 5 minutes
  query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
  severity: warning
  for: 5m
# Warns when the sum of gnatsd_varz_jetstream_stats_accounts stays above 100 for 5 minutes.
# NOTE(review): the metric name refers to accounts, while the rule name and
# description refer to consumers — confirm which quantity is intended.
- name: JetStream consumers exceeded
description: JetStream has more than 100 active consumers
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
severity: warning
for: 5m
# Warns when increase() of gnatsd_varz_auth_timeout over 5 minutes exceeds 5.
# NOTE(review): gnatsd_varz_auth_timeout presumably mirrors the server's
# auth-timeout setting from /varz rather than counting timeout events; if so,
# increase() over it will not reflect authentication failures — confirm.
- name: Frequent authentication timeouts in NATS
description: There have been more than 5 authentication timeouts in the last 5 minutes
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
severity: warning
for: 5m
# Critical when the maximum of gnatsd_varz_max_payload stays above 1000000 for 5 minutes.
# NOTE(review): gnatsd_varz_max_payload presumably reports the configured
# payload limit, not an observed payload size; if so this fires whenever the
# limit itself is set above 1000000 — confirm against the exporter.
- name: Max payload size exceeded in NATS
description: The max payload size allowed by NATS has been exceeded
query: "max(gnatsd_varz_max_payload) > 1000000"
severity: critical
for: 5m
# Critical when the server reports zero leaf node connections for 5 minutes.
# gnatsd_varz_leafnodes is a gauge of current leaf node connections; increase()
# over it is 0 whenever the count is merely stable, which would fire on healthy
# servers, so the gauge is compared with 0 directly.
# Servers that do not use leaf nodes will also match; restrict the query with a
# label selector if only some instances are expected to have leaf nodes.
- name: Leaf node connection issue in NATS
  description: NATS server has had no leaf node connections for the last 5 minutes
  query: "gnatsd_varz_leafnodes == 0"
  severity: critical
  for: 5m
# Warns when gnatsd_varz_ping_max stays above 50 for 5 minutes.
# NOTE(review): gnatsd_varz_ping_max presumably reports the configured limit on
# outstanding pings rather than a count of ping operations — confirm that a
# threshold on this value is meaningful.
- name: Max ping operations exceeded in NATS
description: The maximum number of ping operations in NATS has exceeded 50
query: "gnatsd_varz_ping_max > 50"
severity: warning
for: 5m
# Critical when gnatsd_varz_write_deadline stays above 10 for 5 minutes.
# NOTE(review): gnatsd_varz_write_deadline presumably reports the configured
# write deadline (unit not visible here) rather than a count of missed
# deadlines; if so this rule reflects configuration, not delivery problems — confirm.
- name: Write deadline exceeded in NATS
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
query: "gnatsd_varz_write_deadline > 10"
severity: critical
for: 5m
- name: Solr
exporters:

View file

@ -32,10 +32,154 @@ groups:
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the NATS routes metric stays above 10 for 3 minutes.
# NOTE(review): two `expr` keys appear below. This looks like a rendered diff in
# which the gnatsd_routez_num_routes line was removed and the gnatsd_varz_routes
# line was added — confirm, and keep only one `expr` key in the real file.
- alert: NatsHighRoutesCount
expr: 'gnatsd_routez_num_routes > 10'
expr: 'gnatsd_varz_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when gnatsd_varz_mem stays above 200000000 for 5 minutes.
# Adjust the threshold based on NATS instance sizing.
- alert: NatsHighMemoryUsage
  expr: 'gnatsd_varz_mem > 200000000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: NATS high memory usage (instance {{ $labels.instance }})
    description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical when gnatsd_varz_slow_consumers stays above zero for 3 minutes.
- alert: NatsSlowConsumers
  expr: 'gnatsd_varz_slow_consumers > 0'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: Slow consumers in NATS (instance {{ $labels.instance }})
    description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical when no gnatsd_connz_total series has existed for 5 minutes.
- alert: NatsServerDown
  expr: 'absent(gnatsd_connz_total)'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "NATS server is down"
    description: "NATS server has been down for more than 5 minutes."
# Warns when the NATS server's CPU usage stays above 80% for 5 minutes.
# gnatsd_varz_cpu is exported as a gauge that already holds the server's CPU
# percentage, so it is compared directly; rate() is only meaningful on counters
# and would report the slope of the gauge rather than the utilisation.
- alert: HighNatsCpuUsage
  expr: 'gnatsd_varz_cpu > 80'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High CPU usage on NATS server"
    description: "NATS server is using more than 80% CPU for the last 5 minutes."
# Warns when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
- alert: HighNatsConnections
  expr: 'gnatsd_connz_num_connections > 1000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High number of connections in NATS"
    description: "NATS server has more than 1000 active connections."
# Warns when the ratio of JetStream storage in use to the configured maximum
# storage stays above 0.8 for 5 minutes.
- alert: HighJetStreamStoreUsage
  expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High JetStream store usage"
    description: "JetStream store usage is over 80%."
# Warns when the ratio of JetStream memory in use to the configured maximum
# memory stays above 0.8 for 5 minutes.
- alert: HighJetStreamMemoryUsage
  expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High JetStream memory usage"
    description: "JetStream memory usage is over 80%."
# Warns when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
- alert: NatsSubscriptionsExceeded
  expr: 'gnatsd_connz_subscriptions > 1000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High number of subscriptions in NATS"
    description: "NATS server has more than 1000 active subscriptions."
# Warns when gnatsd_connz_pending_bytes stays above 100000 for 5 minutes.
# The metric counts pending bytes, not messages, so the summary and description
# state the threshold in bytes to match what the expression actually measures.
- alert: NatsMessagesPending
  expr: 'gnatsd_connz_pending_bytes > 100000'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "High number of pending bytes in NATS"
    description: "NATS server has more than 100,000 pending bytes."
# Warns when gnatsd_varz_jetstream_stats_api_errors has increased over the
# trailing 5-minute window, sustained for 5 minutes.
- alert: NatsErrors
  expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Errors in NATS"
    description: "NATS server has encountered errors in the last 5 minutes."
# Warns when the sum of gnatsd_varz_jetstream_stats_accounts stays above 100 for 5 minutes.
# NOTE(review): the metric name refers to accounts, while the alert name,
# summary and description refer to consumers — confirm which quantity is intended.
- alert: JetStreamConsumersExceeded
expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100
for: 5m
labels:
severity: warning
annotations:
summary: "High number of JetStream consumers"
description: "JetStream has more than 100 active consumers."
# Warns when increase() of gnatsd_varz_auth_timeout over 5 minutes exceeds 5.
# NOTE(review): gnatsd_varz_auth_timeout presumably mirrors the server's
# auth-timeout setting from /varz rather than counting timeout events; if so,
# increase() over it will not reflect authentication failures — confirm.
- alert: NatsAuthTimeouts
expr: increase(gnatsd_varz_auth_timeout[5m]) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Frequent authentication timeouts on NATS"
description: "There have been more than 5 authentication timeouts in the last 5 minutes."
# Critical when the maximum of gnatsd_varz_max_payload stays above 1000000 for 5 minutes.
# NOTE(review): gnatsd_varz_max_payload presumably reports the configured
# payload limit, not an observed payload size; if so this fires whenever the
# limit itself is set above 1000000 — confirm against the exporter.
- alert: NatsMaxPayloadExceeded
expr: max(gnatsd_varz_max_payload) > 1000000
for: 5m
labels:
severity: critical
annotations:
summary: "Max payload size exceeded in NATS"
description: "The max payload size allowed by NATS has been exceeded."
# Critical when the server reports zero leaf node connections for 5 minutes.
# gnatsd_varz_leafnodes is a gauge of current leaf node connections; increase()
# over it is 0 whenever the count is merely stable, which would fire on healthy
# servers, so the gauge is compared with 0 directly.
# Servers that do not use leaf nodes will also match; restrict the expression
# with a label selector if only some instances are expected to have leaf nodes.
- alert: NatsLeafNodeIssues
  expr: 'gnatsd_varz_leafnodes == 0'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Leaf node connection issue in NATS"
    description: "NATS server has had no leaf node connections for the last 5 minutes."
# Warns when gnatsd_varz_ping_max stays above 50 for 5 minutes.
# NOTE(review): gnatsd_varz_ping_max presumably reports the configured limit on
# outstanding pings rather than a count of ping operations — confirm that a
# threshold on this value is meaningful.
- alert: NatsPingMaxExceeded
expr: gnatsd_varz_ping_max > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Max ping operations exceeded in NATS"
description: "The maximum number of ping operations in NATS has exceeded 50."
# Critical when gnatsd_varz_write_deadline stays above 10 for 5 minutes.
# NOTE(review): gnatsd_varz_write_deadline presumably reports the configured
# write deadline (unit not visible here) rather than a count of missed
# deadlines; if so this alert reflects configuration, not delivery problems — confirm.
- alert: NatsWriteDeadlineExceeded
expr: gnatsd_varz_write_deadline > 10
for: 5m
labels:
severity: critical
annotations:
summary: "Write deadline exceeded in NATS"
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues."