Merge remote-tracking branch 'origin/add-nats-alert' into add-nats-alert

# Conflicts:
#	_data/rules.yml
This commit is contained in:
somratdutta 2024-08-20 23:49:59 +05:30
commit 6b7d9135f8
2 changed files with 21 additions and 19 deletions

View file

@ -730,9 +730,11 @@ groups:
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql invalid index
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
severity: warning
for: 6h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: SQL Server
exporters:
@ -1537,7 +1539,7 @@ groups:
for: 3m
- name: Nats high memory usage
description: NATS server memory usage is above 200MB for {{ $labels.instance }}
query: "gnatsd_varz_mem > 200 * 1024 * 1024"
query: "gnatsd_varz_mem > 200000000"
severity: warning
for: 5m
- name: Nats slow consumers
@ -1547,25 +1549,25 @@ groups:
for: 3m
- name: Nats server down
description: NATS server has been down for more than 5 minutes
query: "absent(up{job='nats'})"
query: "absent(gnatsd_connz_total)"
severity: critical
for: 5m
- name: High CPU usage on NATS server
- name: Nats high CPU usage
description: NATS server is using more than 80% CPU for the last 5 minutes
query: "rate(gnatsd_varz_cpu[5m]) > 0.8"
severity: warning
for: 5m
- name: High number of connections in NATS
- name: Nats high number of connections
description: NATS server has more than 1000 active connections
query: "gnatsd_connz_num_connections > 1000"
severity: warning
for: 5m
- name: High JetStream store usage
- name: Nats high JetStream store usage
description: JetStream store usage is over 80%
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
severity: warning
for: 5m
- name: High JetStream memory usage
- name: Nats high JetStream memory usage
description: JetStream memory usage is over 80%
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
severity: warning
@ -1575,42 +1577,42 @@ groups:
query: "gnatsd_connz_subscriptions > 1000"
severity: warning
for: 5m
- name: High pending messages in NATS
description: NATS server has more than 100,000 pending bytes
- name: Nats high pending messages
description: NATS server has more than 100,000 bytes of pending messages
query: "gnatsd_connz_pending_bytes > 100000"
severity: warning
for: 5m
- name: Errors in NATS
- name: Nats too many errors
description: NATS JetStream API has reported errors in the last 5 minutes
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
severity: warning
for: 5m
- name: JetStream consumers exceeded
- name: Nats JetStream consumers exceeded
description: JetStream has more than 100 active consumers
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
severity: warning
for: 5m
- name: Frequent authentication timeouts in NATS
- name: Nats frequent authentication timeouts
description: There have been more than 5 authentication timeouts in the last 5 minutes
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
severity: warning
for: 5m
- name: Max payload size exceeded in NATS
description: The max payload size allowed by NATS has been exceeded (1MB)
query: "max(gnatsd_varz_max_payload) > 1024 * 1024"
- name: Nats max payload size exceeded
description: The max payload size configured on the NATS server is above 1,000,000 bytes
query: "max(gnatsd_varz_max_payload) > 1000000"
severity: critical
for: 5m
- name: Leaf node connection issue in NATS
- name: Nats leaf node connection issue
description: No leaf node connections have been established in the last 5 minutes
query: "increase(gnatsd_varz_leafnodes[5m]) == 0"
severity: critical
for: 5m
- name: Max ping operations exceeded in NATS
- name: Nats max ping operations exceeded
description: The maximum number of ping operations in NATS has exceeded 50
query: "gnatsd_varz_ping_max > 50"
severity: warning
for: 5m
- name: Write deadline exceeded in NATS
- name: Nats write deadline exceeded
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
query: "gnatsd_varz_write_deadline > 10"
severity: critical

View file

@ -185,7 +185,7 @@ groups:
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning