mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 17:37:19 +08:00
124 lines
4.9 KiB
YAML
124 lines
4.9 KiB
YAML
# Prometheus alerting rules built on the gnatsd_* metrics.
# All rules live in a single rule group named NatsExporter.
groups:
  - name: NatsExporter
    rules:
|
|
|
|
# Warns when an instance has reported more than 10 routes for 3 minutes.
- alert: NatsHighRoutesCount
  expr: "gnatsd_varz_routes > 10"
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: "Nats high routes count (instance {{ $labels.instance }})"
    description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_varz_mem has stayed above 200 * 1024 * 1024 (209715200)
# for 5 minutes; the annotation describes this threshold as 200MB.
- alert: NatsHighMemoryUsage
  expr: "gnatsd_varz_mem > 200 * 1024 * 1024"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high memory usage (instance {{ $labels.instance }})"
    description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Critical alert for newly detected slow consumers.
# The rule looks at the growth of gnatsd_varz_slow_consumers over the last
# 5 minutes instead of its absolute value: a plain `> 0` comparison keeps
# firing forever once the value has become non-zero, if the metric is a
# running total.
# NOTE(review): this assumes the server reports slow_consumers as a
# cumulative count since start — confirm against your NATS/exporter version.
- alert: NatsSlowConsumers
  expr: 'increase(gnatsd_varz_slow_consumers[5m]) > 0'
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: "Nats slow consumers (instance {{ $labels.instance }})"
    description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Critical alert when the NATS scrape target is failing or missing for 5 minutes.
# Two cases are covered:
#   * up{job="nats"} == 0   - the target is still configured but its scrape
#                             fails; this branch keeps the `instance` label
#                             used by the summary.
#   * absent(up{job="nats"}) - no `up` series exists for the job at all.
#                             absent() only carries over labels from the
#                             equality matchers (here `job`), so `instance`
#                             renders empty in this branch.
# absent() on its own never matches while an `up` series with value 0 exists,
# which is why the `== 0` branch is required.
- alert: NatsServerDown
  expr: 'up{job="nats"} == 0 or absent(up{job="nats"})'
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Nats server down (instance {{ $labels.instance }})"
    description: "NATS server has been down for more than 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# gnatsd_varz_cpu is a gauge reporting CPU percentage (0-100 scale).
# Warns when that gauge has stayed above 80 for 5 minutes.
- alert: NatsHighCpuUsage
  expr: "gnatsd_varz_cpu > 80"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high CPU usage (instance {{ $labels.instance }})"
    description: "NATS server is using more than 80% CPU for the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_connz_num_connections has stayed above 1000 for 5 minutes.
- alert: NatsHighNumberOfConnections
  expr: "gnatsd_connz_num_connections > 1000"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high number of connections (instance {{ $labels.instance }})"
    description: "NATS server has more than 1000 active connections\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when JetStream storage in use exceeds 80% of the configured maximum
# for 5 minutes. The trailing `and ... > 0` clause restricts the result to
# series whose configured max storage is positive.
# {{ $value }} is the used/max ratio, not a percentage.
- alert: NatsHighJetstreamStoreUsage
  expr: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8 and gnatsd_varz_jetstream_config_max_storage > 0"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high JetStream store usage (instance {{ $labels.instance }})"
    description: "JetStream store usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when JetStream memory in use exceeds 80% of the configured maximum
# for 5 minutes. The trailing `and ... > 0` clause restricts the result to
# series whose configured max memory is positive.
# {{ $value }} is the used/max ratio, not a percentage.
- alert: NatsHighJetstreamMemoryUsage
  expr: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8 and gnatsd_varz_jetstream_config_max_memory > 0"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high JetStream memory usage (instance {{ $labels.instance }})"
    description: "JetStream memory usage is over 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_connz_subscriptions has stayed above 1000 for 5 minutes.
- alert: NatsHighNumberOfSubscriptions
  expr: "gnatsd_connz_subscriptions > 1000"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high number of subscriptions (instance {{ $labels.instance }})"
    description: "NATS server has more than 1000 active subscriptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_connz_pending_bytes has stayed above 100000 for 5 minutes.
- alert: NatsHighPendingBytes
  expr: "gnatsd_connz_pending_bytes > 100000"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats high pending bytes (instance {{ $labels.instance }})"
    description: "NATS server has more than 100,000 pending bytes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_varz_jetstream_stats_api_errors keeps increasing:
# the 5-minute increase must be positive, and stay positive for 5 minutes.
- alert: NatsTooManyErrors
  expr: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats too many errors (instance {{ $labels.instance }})"
    description: "NATS server has encountered errors in the last 5 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when an instance reports more than 100 JetStream accounts for 5 minutes.
# The aggregation keeps the `instance` label on purpose: a bare sum() drops
# every label, which leaves {{ $labels.instance }} and {{ $labels }} empty in
# the annotations below.
# NOTE(review): grouping by instance makes the threshold per-instance rather
# than a total across all scraped servers — confirm that is the intended scope.
- alert: NatsJetstreamAccountsExceeded
  expr: 'sum by (instance) (gnatsd_varz_jetstream_stats_accounts) > 100'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats JetStream accounts exceeded (instance {{ $labels.instance }})"
    description: "JetStream has more than 100 active accounts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
|
|
# Warns when gnatsd_varz_leafnodes has been exactly 0 for 5 minutes.
# NOTE(review): this matches any instance reporting zero leaf nodes,
# presumably including servers that are not meant to have any — confirm
# the rule is only loaded where leaf nodes are expected.
- alert: NatsLeafNodeConnectionIssue
  expr: "gnatsd_varz_leafnodes == 0"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Nats leaf node connection issue (instance {{ $labels.instance }})"
    description: "No leaf node connections on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|