mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 03:17:07 +08:00
feat: Add comprehensive NATS and JetStream Prometheus alert rules
- Added multiple Prometheus alert rules for monitoring NATS server and JetStream metrics.
- Included alerts for:
  - High connection count
  - High pending bytes
  - High subscriptions count
  - High routes count
  - High memory usage
  - Slow consumers
  - NATS server downtime
  - High CPU usage
  - High number of active connections
  - High JetStream store and memory usage
  - Subscription limits exceeded
  - High pending messages
  - Authentication timeouts
  - Errors in NATS (JetStream API errors)
  - JetStream consumers limit exceeded
  - Exceeding max payload size
  - Leaf node connection issues
  - Ping operations limit exceeded
  - Write deadline exceeded
- Ensured consistency between `exporter.yml` and `rules.yml` files.
- Improved overall NATS and JetStream monitoring to prevent performance degradation and ensure system reliability.

This commit enhances the visibility of NATS and JetStream operations by providing key metrics to alert on potential issues and optimize system performance.
This commit is contained in:
parent
61da73d517
commit
1a9ba83feb
2 changed files with 227 additions and 2 deletions
|
|
@ -1532,9 +1532,90 @@ groups:
|
|||
for: 3m
|
||||
# Warns when the route count stays above 10 for 3 minutes.
# NOTE(review): two `query` lines appear below because this view is a diff;
# presumably gnatsd_routez_num_routes is the removed line and
# gnatsd_varz_routes its replacement — confirm against the repository.
- name: Nats high routes count
|
||||
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
|
||||
query: "gnatsd_routez_num_routes > 10"
|
||||
query: "gnatsd_varz_routes > 10"
|
||||
severity: warning
|
||||
for: 3m
|
||||
# Warns when gnatsd_varz_mem stays above 200000000 for 5 minutes.
- name: Nats high memory usage
|
||||
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
||||
query: "gnatsd_varz_mem > 200000000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Critical when gnatsd_varz_slow_consumers stays above 0 for 3 minutes.
# NOTE(review): if this metric is cumulative since server start, the alert
# will keep firing after a single slow consumer — confirm.
- name: Nats slow consumers
|
||||
description: There are slow consumers in NATS for {{ $labels.instance }}
|
||||
query: "gnatsd_varz_slow_consumers > 0"
|
||||
severity: critical
|
||||
for: 3m
|
||||
# Critical when no gnatsd_connz_total series has existed for 5 minutes.
# NOTE(review): presumably the connz metrics are only exported when connection
# monitoring is enabled on the exporter — confirm, otherwise this always fires.
- name: Nats server down
|
||||
description: NATS server has been down for more than 5 minutes
|
||||
query: "absent(gnatsd_connz_total)"
|
||||
severity: critical
|
||||
for: 5m
|
||||
# Warns when the NATS server CPU usage stays above 80% for 5 minutes.
- name: High CPU usage on NATS server
|
||||
description: NATS server is using more than 80% CPU for the last 5 minutes
|
||||
# gnatsd_varz_cpu is a gauge holding the current CPU percentage, so it is
# compared directly; rate() only makes sense on counters.
query: "gnatsd_varz_cpu > 80"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
- name: High number of connections in NATS
|
||||
description: NATS server has more than 1000 active connections
|
||||
query: "gnatsd_connz_num_connections > 1000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when used JetStream storage exceeds 80% of the configured maximum
# (stats_storage / config_max_storage) for 5 minutes.
- name: High JetStream store usage
|
||||
description: JetStream store usage is over 80%
|
||||
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when used JetStream memory exceeds 80% of the configured maximum
# (stats_memory / config_max_memory) for 5 minutes.
- name: High JetStream memory usage
|
||||
description: JetStream memory usage is over 80%
|
||||
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
- name: Nats high number of subscriptions
|
||||
description: NATS server has more than 1000 active subscriptions
|
||||
query: "gnatsd_connz_subscriptions > 1000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when the pending byte count stays above 100,000 for 5 minutes.
# The rule name is kept as-is so the derived alert identifier does not change;
# note that the metric measures bytes, not messages.
- name: High pending messages in NATS
|
||||
description: NATS server has more than 100,000 pending bytes
|
||||
query: "gnatsd_connz_pending_bytes > 100000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when the JetStream API error counter has grown within the last
# 5 minutes, sustained for 5 minutes.
- name: Errors in NATS
|
||||
description: NATS JetStream API has returned errors in the last 5 minutes
|
||||
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when the summed JetStream account count stays above 100 for 5 minutes.
# The rule name is kept as-is so the derived alert identifier does not change;
# the queried metric counts accounts, not consumers.
- name: JetStream consumers exceeded
|
||||
description: JetStream has more than 100 active accounts
|
||||
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Warns when gnatsd_varz_auth_timeout grows by more than 5 within 5 minutes.
# NOTE(review): gnatsd_varz_auth_timeout looks like the configured
# authentication timeout rather than an event counter; increase() over a
# constant value is always 0, so this rule may never fire — confirm.
- name: Frequent authentication timeouts in NATS
|
||||
description: There have been more than 5 authentication timeouts in the last 5 minutes
|
||||
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Critical when the largest gnatsd_varz_max_payload value exceeds 1000000.
# NOTE(review): gnatsd_varz_max_payload looks like the server's configured
# max_payload limit, not a measure of oversized messages; if so, this reflects
# configuration rather than traffic and may fire permanently — confirm.
- name: Max payload size exceeded in NATS
|
||||
description: The max payload size allowed by NATS has been exceeded
|
||||
query: "max(gnatsd_varz_max_payload) > 1000000"
|
||||
severity: critical
|
||||
for: 5m
|
||||
# Critical when the server reports zero leaf node connections for 5 minutes.
# Only meaningful on servers that are expected to have leaf nodes attached.
- name: Leaf node connection issue in NATS
|
||||
description: NATS server has had no leaf node connections for the last 5 minutes
|
||||
# gnatsd_varz_leafnodes is the current connection count (a gauge), so it is
# compared to zero directly; increase(...) == 0 would fire whenever the
# count is merely stable.
query: "gnatsd_varz_leafnodes == 0"
|
||||
severity: critical
|
||||
for: 5m
|
||||
# Warns when gnatsd_varz_ping_max stays above 50 for 5 minutes.
# NOTE(review): gnatsd_varz_ping_max looks like the configured limit of
# outstanding pings, not a count of ping operations — confirm that alerting
# on a configuration value is intended.
- name: Max ping operations exceeded in NATS
|
||||
description: The maximum number of ping operations in NATS has exceeded 50
|
||||
query: "gnatsd_varz_ping_max > 50"
|
||||
severity: warning
|
||||
for: 5m
|
||||
# Critical when gnatsd_varz_write_deadline stays above 10 for 5 minutes.
# NOTE(review): gnatsd_varz_write_deadline looks like the configured write
# deadline (unit unclear from here, possibly nanoseconds), not a count of
# missed deadlines — confirm meaning and unit before relying on this rule.
- name: Write deadline exceeded in NATS
|
||||
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
|
||||
query: "gnatsd_varz_write_deadline > 10"
|
||||
severity: critical
|
||||
for: 5m
|
||||
|
||||
|
||||
- name: Solr
|
||||
exporters:
|
||||
|
|
|
|||
146
dist/rules/nats/nats-exporter.yml
vendored
146
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -32,10 +32,154 @@ groups:
|
|||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when the route count stays above 10 for 3 minutes.
# NOTE(review): two `expr` lines appear below because this view is a diff;
# presumably gnatsd_routez_num_routes is the removed line and
# gnatsd_varz_routes its replacement — confirm against the repository.
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_routez_num_routes > 10'
|
||||
expr: 'gnatsd_varz_routes > 10'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when gnatsd_varz_mem stays above 200000000 for 5 minutes.
- alert: NatsHighMemoryUsage
|
||||
expr: 'gnatsd_varz_mem > 200000000' # Adjust based on NATS instance sizing
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: NATS high memory usage (instance {{ $labels.instance }})
|
||||
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when gnatsd_varz_slow_consumers stays above 0 for 3 minutes.
# NOTE(review): if this metric is cumulative since server start, the alert
# will keep firing after a single slow consumer — confirm.
- alert: NatsSlowConsumers
|
||||
expr: 'gnatsd_varz_slow_consumers > 0'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Slow consumers in NATS (instance {{ $labels.instance }})
|
||||
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when no gnatsd_connz_total series has existed for 5 minutes.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: NatsServerDown
|
||||
expr: 'absent(gnatsd_connz_total)'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: NATS server is down (instance {{ $labels.instance }})
|
||||
description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when the NATS server CPU usage stays above 80% for 5 minutes.
- alert: HighNatsCpuUsage
|
||||
# gnatsd_varz_cpu is a gauge holding the current CPU percentage, so it is
# compared directly; rate() only makes sense on counters.
expr: 'gnatsd_varz_cpu > 80'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High CPU usage on NATS server (instance {{ $labels.instance }})
|
||||
description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when gnatsd_connz_num_connections stays above 1000 for 5 minutes.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: HighNatsConnections
|
||||
expr: 'gnatsd_connz_num_connections > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High number of connections in NATS (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when used JetStream storage exceeds 80% of the configured maximum
# for 5 minutes. Summary/description follow the same instance/VALUE/LABELS
# layout as the other alerts in this group.
- alert: HighJetStreamStoreUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High JetStream store usage (instance {{ $labels.instance }})
|
||||
description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when used JetStream memory exceeds 80% of the configured maximum
# for 5 minutes. Summary/description follow the same instance/VALUE/LABELS
# layout as the other alerts in this group.
- alert: HighJetStreamMemoryUsage
|
||||
expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High JetStream memory usage (instance {{ $labels.instance }})
|
||||
description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when gnatsd_connz_subscriptions stays above 1000 for 5 minutes.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: NatsSubscriptionsExceeded
|
||||
expr: 'gnatsd_connz_subscriptions > 1000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High number of subscriptions in NATS (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when the pending byte count stays above 100,000 for 5 minutes.
# The alert name is kept for compatibility with existing routing/silences;
# the metric measures bytes, so the annotations say bytes rather than messages.
- alert: NatsMessagesPending
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High pending bytes in NATS (instance {{ $labels.instance }})
|
||||
description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when the JetStream API error counter has grown within the last
# 5 minutes, sustained for 5 minutes.
- alert: NatsErrors
|
||||
expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: JetStream API errors in NATS (instance {{ $labels.instance }})
|
||||
description: "NATS JetStream API has returned errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when the summed JetStream account count stays above 100 for 5 minutes.
# The alert name is kept for compatibility with existing routing/silences;
# the queried metric counts accounts, so the annotations say accounts.
# sum() drops the instance label, hence no instance in the summary.
- alert: JetStreamConsumersExceeded
|
||||
expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: High number of JetStream accounts
|
||||
description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when gnatsd_varz_auth_timeout grows by more than 5 within 5 minutes.
# NOTE(review): gnatsd_varz_auth_timeout looks like the configured
# authentication timeout rather than an event counter; increase() over a
# constant value is always 0, so this rule may never fire — confirm.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: NatsAuthTimeouts
|
||||
expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Frequent authentication timeouts on NATS (instance {{ $labels.instance }})
|
||||
description: "There have been more than 5 authentication timeouts in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the largest gnatsd_varz_max_payload value exceeds 1000000.
# NOTE(review): gnatsd_varz_max_payload looks like the server's configured
# max_payload limit, not a measure of oversized messages; if so, this reflects
# configuration rather than traffic and may fire permanently — confirm.
# max() drops the instance label, hence no instance in the summary.
- alert: NatsMaxPayloadExceeded
|
||||
expr: 'max(gnatsd_varz_max_payload) > 1000000'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Max payload size exceeded in NATS
|
||||
description: "The max payload size allowed by NATS has been exceeded\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the server reports zero leaf node connections for 5 minutes.
# Only meaningful on servers that are expected to have leaf nodes attached.
- alert: NatsLeafNodeIssues
|
||||
# gnatsd_varz_leafnodes is the current connection count (a gauge), so it is
# compared to zero directly; increase(...) == 0 would fire whenever the
# count is merely stable.
expr: 'gnatsd_varz_leafnodes == 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Leaf node connection issue in NATS (instance {{ $labels.instance }})
|
||||
description: "NATS server has had no leaf node connections for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warns when gnatsd_varz_ping_max stays above 50 for 5 minutes.
# NOTE(review): gnatsd_varz_ping_max looks like the configured limit of
# outstanding pings, not a count of ping operations — confirm that alerting
# on a configuration value is intended.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: NatsPingMaxExceeded
|
||||
expr: 'gnatsd_varz_ping_max > 50'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Max ping operations exceeded in NATS (instance {{ $labels.instance }})
|
||||
description: "The maximum number of ping operations in NATS has exceeded 50\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when gnatsd_varz_write_deadline stays above 10 for 5 minutes.
# NOTE(review): gnatsd_varz_write_deadline looks like the configured write
# deadline (unit unclear from here, possibly nanoseconds), not a count of
# missed deadlines — confirm meaning and unit before relying on this rule.
# Summary/description follow the same instance/VALUE/LABELS layout as the
# other alerts in this group.
- alert: NatsWriteDeadlineExceeded
|
||||
expr: 'gnatsd_varz_write_deadline > 10'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Write deadline exceeded in NATS (instance {{ $labels.instance }})
|
||||
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
|
|||
Loading…
Reference in a new issue