mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 11:27:00 +08:00
- minor changes, rollback rules.yml
- address comment changes - revert to old rules.yml as they are generated
This commit is contained in:
parent
1a9ba83feb
commit
be55ba1e5a
2 changed files with 39 additions and 183 deletions
|
|
@ -1537,7 +1537,7 @@ groups:
|
||||||
for: 3m
|
for: 3m
|
||||||
- name: Nats high memory usage
|
- name: Nats high memory usage
|
||||||
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
|
||||||
query: "gnatsd_varz_mem > 200000000"
|
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Nats slow consumers
|
- name: Nats slow consumers
|
||||||
|
|
@ -1547,7 +1547,7 @@ groups:
|
||||||
for: 3m
|
for: 3m
|
||||||
- name: Nats server down
|
- name: Nats server down
|
||||||
description: NATS server has been down for more than 5 minutes
|
description: NATS server has been down for more than 5 minutes
|
||||||
query: "absent(gnatsd_connz_total)"
|
query: "absent(up{job='nats'})"
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: High CPU usage on NATS server
|
- name: High CPU usage on NATS server
|
||||||
|
|
@ -1576,7 +1576,7 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: High pending messages in NATS
|
- name: High pending messages in NATS
|
||||||
description: NATS server has more than 100,000 pending messages
|
description: NATS server has more than 100,000 pending bytes
|
||||||
query: "gnatsd_connz_pending_bytes > 100000"
|
query: "gnatsd_connz_pending_bytes > 100000"
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
|
|
@ -1596,8 +1596,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Max payload size exceeded in NATS
|
- name: Max payload size exceeded in NATS
|
||||||
description: The max payload size allowed by NATS has been exceeded
|
description: The max payload size allowed by NATS has been exceeded (1MB)
|
||||||
query: "max(gnatsd_varz_max_payload) > 1000000"
|
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
|
||||||
severity: critical
|
severity: critical
|
||||||
for: 5m
|
for: 5m
|
||||||
- name: Leaf node connection issue in NATS
|
- name: Leaf node connection issue in NATS
|
||||||
|
|
|
||||||
148
dist/rules/nats/nats-exporter.yml
vendored
148
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -1,6 +1,6 @@
|
||||||
groups:
|
groups:
|
||||||
|
|
||||||
- name: NatsExporter
|
- name: NatsExporter
|
||||||
|
|
||||||
rules:
|
rules:
|
||||||
|
|
||||||
|
|
@ -32,154 +32,10 @@ groups:
|
||||||
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: NatsHighRoutesCount
|
- alert: NatsHighRoutesCount
|
||||||
expr: 'gnatsd_varz_routes > 10'
|
expr: 'gnatsd_routez_num_routes > 10'
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Nats high routes count (instance {{ $labels.instance }})
|
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||||
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: NatsHighMemoryUsage
|
|
||||||
expr: 'gnatsd_varz_mem > 200000000' # Adjust based on NATS instance sizing
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: NATS high memory usage (instance {{ $labels.instance }})
|
|
||||||
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
||||||
|
|
||||||
- alert: NatsSlowConsumers
|
|
||||||
expr: 'gnatsd_varz_slow_consumers > 0'
|
|
||||||
for: 3m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: Slow consumers in NATS (instance {{ $labels.instance }})
|
|
||||||
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
|
||||||
|
|
||||||
- alert: NatsServerDown
|
|
||||||
expr: absent(gnatsd_connz_total)
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "NATS server is down"
|
|
||||||
description: "NATS server has been down for more than 5 minutes."
|
|
||||||
|
|
||||||
- alert: HighNatsCpuUsage
|
|
||||||
expr: rate(gnatsd_varz_cpu[5m]) > 0.8
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High CPU usage on NATS server"
|
|
||||||
description: "NATS server is using more than 80% CPU for the last 5 minutes."
|
|
||||||
|
|
||||||
- alert: HighNatsConnections
|
|
||||||
expr: gnatsd_connz_num_connections > 1000
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High number of connections in NATS"
|
|
||||||
description: "NATS server has more than 1000 active connections."
|
|
||||||
|
|
||||||
- alert: HighJetStreamStoreUsage
|
|
||||||
expr: gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High JetStream store usage"
|
|
||||||
description: "JetStream store usage is over 80%."
|
|
||||||
|
|
||||||
- alert: HighJetStreamMemoryUsage
|
|
||||||
expr: gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High JetStream memory usage"
|
|
||||||
description: "JetStream memory usage is over 80%."
|
|
||||||
|
|
||||||
- alert: NatsSubscriptionsExceeded
|
|
||||||
expr: gnatsd_connz_subscriptions > 1000
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High number of subscriptions in NATS"
|
|
||||||
description: "NATS server has more than 1000 active subscriptions."
|
|
||||||
|
|
||||||
- alert: NatsMessagesPending
|
|
||||||
expr: gnatsd_connz_pending_bytes > 100000
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High number of pending messages in NATS"
|
|
||||||
description: "NATS server has more than 100,000 pending messages."
|
|
||||||
|
|
||||||
- alert: NatsErrors
|
|
||||||
expr: increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Errors in NATS"
|
|
||||||
description: "NATS server has encountered errors in the last 5 minutes."
|
|
||||||
|
|
||||||
- alert: JetStreamConsumersExceeded
|
|
||||||
expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "High number of JetStream consumers"
|
|
||||||
description: "JetStream has more than 100 active consumers."
|
|
||||||
|
|
||||||
- alert: NatsAuthTimeouts
|
|
||||||
expr: increase(gnatsd_varz_auth_timeout[5m]) > 5
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Frequent authentication timeouts on NATS"
|
|
||||||
description: "There have been more than 5 authentication timeouts in the last 5 minutes."
|
|
||||||
|
|
||||||
- alert: NatsMaxPayloadExceeded
|
|
||||||
expr: max(gnatsd_varz_max_payload) > 1000000
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Max payload size exceeded in NATS"
|
|
||||||
description: "The max payload size allowed by NATS has been exceeded."
|
|
||||||
|
|
||||||
- alert: NatsLeafNodeIssues
|
|
||||||
expr: increase(gnatsd_varz_leafnodes[5m]) == 0
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Leaf node connection issue in NATS"
|
|
||||||
description: "No leaf node connections have been established in the last 5 minutes."
|
|
||||||
|
|
||||||
- alert: NatsPingMaxExceeded
|
|
||||||
expr: gnatsd_varz_ping_max > 50
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
annotations:
|
|
||||||
summary: "Max ping operations exceeded in NATS"
|
|
||||||
description: "The maximum number of ping operations in NATS has exceeded 50."
|
|
||||||
|
|
||||||
- alert: NatsWriteDeadlineExceeded
|
|
||||||
expr: gnatsd_varz_write_deadline > 10
|
|
||||||
for: 5m
|
|
||||||
labels:
|
|
||||||
severity: critical
|
|
||||||
annotations:
|
|
||||||
summary: "Write deadline exceeded in NATS"
|
|
||||||
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues."
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue