mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
- minor changes; roll back rules.yml
- address review comments: revert to the old rules.yml, since these files are generated
This commit is contained in:
parent
1a9ba83feb
commit
be55ba1e5a
2 changed files with 39 additions and 183 deletions
|
|
@ -1537,7 +1537,7 @@ groups:
|
|||
for: 3m
|
||||
- name: Nats high memory usage
|
||||
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
||||
query: "gnatsd_varz_mem > 200000000"
|
||||
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: Nats slow consumers
|
||||
|
|
@ -1547,7 +1547,7 @@ groups:
|
|||
for: 3m
|
||||
- name: Nats server down
|
||||
description: NATS server has been down for more than 5 minutes
|
||||
query: "absent(gnatsd_connz_total)"
|
||||
query: "absent(up{job='nats'})"
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: High CPU usage on NATS server
|
||||
|
|
@ -1576,7 +1576,7 @@ groups:
|
|||
severity: warning
|
||||
for: 5m
|
||||
- name: High pending messages in NATS
|
||||
description: NATS server has more than 100,000 pending messages
|
||||
description: NATS server has more than 100,000 pending bytes
|
||||
query: "gnatsd_connz_pending_bytes > 100000"
|
||||
severity: warning
|
||||
for: 5m
|
||||
|
|
@ -1596,8 +1596,8 @@ groups:
|
|||
severity: warning
|
||||
for: 5m
|
||||
- name: Max payload size exceeded in NATS
|
||||
description: The max payload size allowed by NATS has been exceeded
|
||||
query: "max(gnatsd_varz_max_payload) > 1000000"
|
||||
description: The max payload size allowed by NATS has been exceeded (1MB)
|
||||
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: Leaf node connection issue in NATS
|
||||
|
|
|
|||
212
dist/rules/nats/nats-exporter.yml
vendored
212
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -1,185 +1,41 @@
|
|||
groups:
|
||||
|
||||
- name: NatsExporter
|
||||
- name: NatsExporter
|
||||
|
||||
rules:
|
||||
rules:
|
||||
|
||||
- alert: NatsHighConnectionCount
|
||||
expr: 'gnatsd_varz_connections > 100'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high connection count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: NatsHighConnectionCount
|
||||
expr: 'gnatsd_varz_connections > 100'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high connection count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighPendingBytes
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: NatsHighPendingBytes
|
||||
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighSubscriptionsCount
|
||||
expr: 'gnatsd_connz_subscriptions > 50'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high subscriptions count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: NatsHighSubscriptionsCount
|
||||
expr: 'gnatsd_connz_subscriptions > 50'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high subscriptions count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_varz_routes > 10'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsHighMemoryUsage
|
||||
expr: 'gnatsd_varz_mem > 200000000' # Adjust based on NATS instance sizing
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: NATS high memory usage (instance {{ $labels.instance }})
|
||||
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsSlowConsumers
|
||||
expr: 'gnatsd_varz_slow_consumers > 0'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Slow consumers in NATS (instance {{ $labels.instance }})
|
||||
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: NatsServerDown
|
||||
expr: absent(gnatsd_connz_total)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "NATS server is down"
|
||||
description: "NATS server has been down for more than 5 minutes."
|
||||
|
||||
- alert: HighNatsCpuUsage
|
||||
expr: rate(gnatsd_varz_cpu[5m]) > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High CPU usage on NATS server"
|
||||
description: "NATS server is using more than 80% CPU for the last 5 minutes."
|
||||
|
||||
- alert: HighNatsConnections
|
||||
expr: gnatsd_connz_num_connections > 1000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High number of connections in NATS"
|
||||
description: "NATS server has more than 1000 active connections."
|
||||
|
||||
- alert: HighJetStreamStoreUsage
|
||||
expr: gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High JetStream store usage"
|
||||
description: "JetStream store usage is over 80%."
|
||||
|
||||
- alert: HighJetStreamMemoryUsage
|
||||
expr: gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High JetStream memory usage"
|
||||
description: "JetStream memory usage is over 80%."
|
||||
|
||||
- alert: NatsSubscriptionsExceeded
|
||||
expr: gnatsd_connz_subscriptions > 1000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High number of subscriptions in NATS"
|
||||
description: "NATS server has more than 1000 active subscriptions."
|
||||
|
||||
- alert: NatsMessagesPending
|
||||
expr: gnatsd_connz_pending_bytes > 100000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High number of pending messages in NATS"
|
||||
description: "NATS server has more than 100,000 pending messages."
|
||||
|
||||
- alert: NatsErrors
|
||||
expr: increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Errors in NATS"
|
||||
description: "NATS server has encountered errors in the last 5 minutes."
|
||||
|
||||
- alert: JetStreamConsumersExceeded
|
||||
expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "High number of JetStream consumers"
|
||||
description: "JetStream has more than 100 active consumers."
|
||||
|
||||
- alert: NatsAuthTimeouts
|
||||
expr: increase(gnatsd_varz_auth_timeout[5m]) > 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Frequent authentication timeouts on NATS"
|
||||
description: "There have been more than 5 authentication timeouts in the last 5 minutes."
|
||||
|
||||
- alert: NatsMaxPayloadExceeded
|
||||
expr: max(gnatsd_varz_max_payload) > 1000000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Max payload size exceeded in NATS"
|
||||
description: "The max payload size allowed by NATS has been exceeded."
|
||||
|
||||
- alert: NatsLeafNodeIssues
|
||||
expr: increase(gnatsd_varz_leafnodes[5m]) == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Leaf node connection issue in NATS"
|
||||
description: "No leaf node connections have been established in the last 5 minutes."
|
||||
|
||||
- alert: NatsPingMaxExceeded
|
||||
expr: gnatsd_varz_ping_max > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Max ping operations exceeded in NATS"
|
||||
description: "The maximum number of ping operations in NATS has exceeded 50."
|
||||
|
||||
- alert: NatsWriteDeadlineExceeded
|
||||
expr: gnatsd_varz_write_deadline > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Write deadline exceeded in NATS"
|
||||
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues."
|
||||
- alert: NatsHighRoutesCount
|
||||
expr: 'gnatsd_routez_num_routes > 10'
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
Loading…
Reference in a new issue