- minor changes, roll back rules.yml

- address the changes requested in comments
- revert rules.yml to its old version, since its rules are generated
This commit is contained in:
somrat.dutta 2024-08-20 23:41:00 +05:30
parent 1a9ba83feb
commit be55ba1e5a
2 changed files with 39 additions and 183 deletions

View file

@@ -1537,7 +1537,7 @@ groups:
for: 3m
- name: Nats high memory usage
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
query: "gnatsd_varz_mem > 200000000"
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
severity: warning
for: 5m
- name: Nats slow consumers
@@ -1547,7 +1547,7 @@ groups:
for: 3m
- name: Nats server down
description: NATS server has been down for more than 5 minutes
query: "absent(gnatsd_connz_total)"
query: "absent(up{job='nats'})"
severity: critical
for: 5m
- name: High CPU usage on NATS server
@@ -1576,7 +1576,7 @@ groups:
severity: warning
for: 5m
- name: High pending messages in NATS
description: NATS server has more than 100,000 pending messages
description: NATS server has more than 100,000 pending bytes
query: "gnatsd_connz_pending_bytes > 100000"
severity: warning
for: 5m
@@ -1596,8 +1596,8 @@ groups:
severity: warning
for: 5m
- name: Max payload size exceeded in NATS
description: The max payload size allowed by NATS has been exceeded
query: "max(gnatsd_varz_max_payload) > 1000000"
description: The max payload size allowed by NATS has been exceeded (1MB)
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
severity: critical
for: 5m
- name: Leaf node connection issue in NATS

View file

@@ -1,185 +1,41 @@
groups:
- name: NatsExporter
- name: NatsExporter
rules:
rules:
- alert: NatsHighConnectionCount
expr: 'gnatsd_varz_connections > 100'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high connection count (instance {{ $labels.instance }})
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighConnectionCount
expr: 'gnatsd_varz_connections > 100'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high connection count (instance {{ $labels.instance }})
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighPendingBytes
expr: 'gnatsd_connz_pending_bytes > 100000'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high pending bytes (instance {{ $labels.instance }})
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighSubscriptionsCount
expr: 'gnatsd_connz_subscriptions > 50'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high subscriptions count (instance {{ $labels.instance }})
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighSubscriptionsCount
expr: 'gnatsd_connz_subscriptions > 50'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high subscriptions count (instance {{ $labels.instance }})
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighRoutesCount
expr: 'gnatsd_varz_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsHighMemoryUsage
expr: 'gnatsd_varz_mem > 200000000' # Adjust based on NATS instance sizing
for: 5m
labels:
severity: warning
annotations:
summary: NATS high memory usage (instance {{ $labels.instance }})
description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsSlowConsumers
expr: 'gnatsd_varz_slow_consumers > 0'
for: 3m
labels:
severity: critical
annotations:
summary: Slow consumers in NATS (instance {{ $labels.instance }})
description: "There are slow consumers in NATS for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: NatsServerDown
expr: absent(gnatsd_connz_total)
for: 5m
labels:
severity: critical
annotations:
summary: "NATS server is down"
description: "NATS server has been down for more than 5 minutes."
- alert: HighNatsCpuUsage
expr: rate(gnatsd_varz_cpu[5m]) > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High CPU usage on NATS server"
description: "NATS server is using more than 80% CPU for the last 5 minutes."
- alert: HighNatsConnections
expr: gnatsd_connz_num_connections > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High number of connections in NATS"
description: "NATS server has more than 1000 active connections."
- alert: HighJetStreamStoreUsage
expr: gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High JetStream store usage"
description: "JetStream store usage is over 80%."
- alert: HighJetStreamMemoryUsage
expr: gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: "High JetStream memory usage"
description: "JetStream memory usage is over 80%."
- alert: NatsSubscriptionsExceeded
expr: gnatsd_connz_subscriptions > 1000
for: 5m
labels:
severity: warning
annotations:
summary: "High number of subscriptions in NATS"
description: "NATS server has more than 1000 active subscriptions."
- alert: NatsMessagesPending
expr: gnatsd_connz_pending_bytes > 100000
for: 5m
labels:
severity: warning
annotations:
summary: "High number of pending messages in NATS"
description: "NATS server has more than 100,000 pending messages."
- alert: NatsErrors
expr: increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Errors in NATS"
description: "NATS server has encountered errors in the last 5 minutes."
- alert: JetStreamConsumersExceeded
expr: sum(gnatsd_varz_jetstream_stats_accounts) > 100
for: 5m
labels:
severity: warning
annotations:
summary: "High number of JetStream consumers"
description: "JetStream has more than 100 active consumers."
- alert: NatsAuthTimeouts
expr: increase(gnatsd_varz_auth_timeout[5m]) > 5
for: 5m
labels:
severity: warning
annotations:
summary: "Frequent authentication timeouts on NATS"
description: "There have been more than 5 authentication timeouts in the last 5 minutes."
- alert: NatsMaxPayloadExceeded
expr: max(gnatsd_varz_max_payload) > 1000000
for: 5m
labels:
severity: critical
annotations:
summary: "Max payload size exceeded in NATS"
description: "The max payload size allowed by NATS has been exceeded."
- alert: NatsLeafNodeIssues
expr: increase(gnatsd_varz_leafnodes[5m]) == 0
for: 5m
labels:
severity: critical
annotations:
summary: "Leaf node connection issue in NATS"
description: "No leaf node connections have been established in the last 5 minutes."
- alert: NatsPingMaxExceeded
expr: gnatsd_varz_ping_max > 50
for: 5m
labels:
severity: warning
annotations:
summary: "Max ping operations exceeded in NATS"
description: "The maximum number of ping operations in NATS has exceeded 50."
- alert: NatsWriteDeadlineExceeded
expr: gnatsd_varz_write_deadline > 10
for: 5m
labels:
severity: critical
annotations:
summary: "Write deadline exceeded in NATS"
description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues."
- alert: NatsHighRoutesCount
expr: 'gnatsd_routez_num_routes > 10'
for: 3m
labels:
severity: warning
annotations:
summary: Nats high routes count (instance {{ $labels.instance }})
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"