Merge remote-tracking branch 'origin/add-nats-alert' into add-nats-alert

# Conflicts:
#	_data/rules.yml
This commit is contained in:
somratdutta 2024-08-20 23:49:59 +05:30
commit 6b7d9135f8
2 changed files with 21 additions and 19 deletions

View file

@ -730,9 +730,11 @@ groups:
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737 See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: Postgresql invalid index - name: Postgresql invalid index
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`" description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`"
query: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' query: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
severity: warning severity: warning
for: 6h for: 6h
comments: |
See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- name: SQL Server - name: SQL Server
exporters: exporters:
@ -1537,7 +1539,7 @@ groups:
for: 3m for: 3m
- name: Nats high memory usage - name: Nats high memory usage
description: NATS server memory usage is above 200MB for {{ $labels.instance }} description: NATS server memory usage is above 200MB for {{ $labels.instance }}
query: "gnatsd_varz_mem > 200 * 1024 * 1024" query: "gnatsd_varz_mem > 200000000"
severity: warning severity: warning
for: 5m for: 5m
- name: Nats slow consumers - name: Nats slow consumers
@ -1547,25 +1549,25 @@ groups:
for: 3m for: 3m
- name: Nats server down - name: Nats server down
description: NATS server has been down for more than 5 minutes description: NATS server has been down for more than 5 minutes
query: "absent(up{job='nats'})" query: "absent(gnatsd_connz_total)"
severity: critical severity: critical
for: 5m for: 5m
- name: High CPU usage on NATS server - name: Nats high CPU usage
description: NATS server is using more than 80% CPU for the last 5 minutes description: NATS server is using more than 80% CPU for the last 5 minutes
query: "rate(gnatsd_varz_cpu[5m]) > 0.8" query: "rate(gnatsd_varz_cpu[5m]) > 0.8"
severity: warning severity: warning
for: 5m for: 5m
- name: High number of connections in NATS - name: Nats high number of connections
description: NATS server has more than 1000 active connections description: NATS server has more than 1000 active connections
query: "gnatsd_connz_num_connections > 1000" query: "gnatsd_connz_num_connections > 1000"
severity: warning severity: warning
for: 5m for: 5m
- name: High JetStream store usage - name: Nats high JetStream store usage
description: JetStream store usage is over 80% description: JetStream store usage is over 80%
query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8" query: "gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8"
severity: warning severity: warning
for: 5m for: 5m
- name: High JetStream memory usage - name: Nats high JetStream memory usage
description: JetStream memory usage is over 80% description: JetStream memory usage is over 80%
query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8" query: "gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8"
severity: warning severity: warning
@ -1575,42 +1577,42 @@ groups:
query: "gnatsd_connz_subscriptions > 1000" query: "gnatsd_connz_subscriptions > 1000"
severity: warning severity: warning
for: 5m for: 5m
- name: High pending messages in NATS - name: Nats high pending messages
description: NATS server has more than 100,000 pending bytes description: NATS server has more than 100,000 pending bytes
query: "gnatsd_connz_pending_bytes > 100000" query: "gnatsd_connz_pending_bytes > 100000"
severity: warning severity: warning
for: 5m for: 5m
- name: Errors in NATS - name: Nats too many errors
description: NATS server has encountered errors in the last 5 minutes description: NATS server has encountered errors in the last 5 minutes
query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0" query: "increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0"
severity: warning severity: warning
for: 5m for: 5m
- name: JetStream consumers exceeded - name: Nats JetStream consumers exceeded
description: JetStream has more than 100 active consumers description: JetStream has more than 100 active consumers
query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100" query: "sum(gnatsd_varz_jetstream_stats_accounts) > 100"
severity: warning severity: warning
for: 5m for: 5m
- name: Frequent authentication timeouts in NATS - name: Nats frequent authentication timeouts
description: There have been more than 5 authentication timeouts in the last 5 minutes description: There have been more than 5 authentication timeouts in the last 5 minutes
query: "increase(gnatsd_varz_auth_timeout[5m]) > 5" query: "increase(gnatsd_varz_auth_timeout[5m]) > 5"
severity: warning severity: warning
for: 5m for: 5m
- name: Max payload size exceeded in NATS - name: Nats max payload size exceeded
description: The max payload size allowed by NATS has been exceeded (1MB) description: The max payload size allowed by NATS has been exceeded
query: "max(gnatsd_varz_max_payload) > 1024 * 1024" query: "max(gnatsd_varz_max_payload) > 1000000"
severity: critical severity: critical
for: 5m for: 5m
- name: Leaf node connection issue in NATS - name: Nats leaf node connection issue
description: No leaf node connections have been established in the last 5 minutes description: No leaf node connections have been established in the last 5 minutes
query: "increase(gnatsd_varz_leafnodes[5m]) == 0" query: "increase(gnatsd_varz_leafnodes[5m]) == 0"
severity: critical severity: critical
for: 5m for: 5m
- name: Max ping operations exceeded in NATS - name: Nats max ping operations exceeded
description: The maximum number of ping operations in NATS has exceeded 50 description: The maximum number of ping operations in NATS has exceeded 50
query: "gnatsd_varz_ping_max > 50" query: "gnatsd_varz_ping_max > 50"
severity: warning severity: warning
for: 5m for: 5m
- name: Write deadline exceeded in NATS - name: Nats write deadline exceeded
description: The write deadline has been exceeded in NATS, indicating potential message delivery issues description: The write deadline has been exceeded in NATS, indicating potential message delivery issues
query: "gnatsd_varz_write_deadline > 10" query: "gnatsd_varz_write_deadline > 10"
severity: critical severity: critical

View file

@ -185,7 +185,7 @@ groups:
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex - alert: PostgresqlInvalidIndex
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h for: 6h
labels: labels:
severity: warning severity: warning