From c2b8178304a839eb108a6cd3f297645e8e90ad7b Mon Sep 17 00:00:00 2001 From: Gjed Date: Sun, 4 Jul 2021 23:59:46 +0200 Subject: [PATCH] Loki alerts (#218) Co-authored-by: Samuel Berthe --- README.md | 2 ++ _data/rules.yml | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/README.md b/README.md index d0602de..0f1562e 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,8 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/ #### Other - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos) +- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki) +- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex) ## 🤝 Contributing diff --git a/_data/rules.yml b/_data/rules.yml index c2f5385..99f566d 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2001,3 +2001,29 @@ groups: description: Thanos compaction has not run in 24 hours. query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' severity: critical + - name: Loki + exporters: + - rules: + - name: Loki process too many restarts + description: A loki process had too many restarts (target {{ $labels.instance }}) + query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2 + severity: warning + - name: Cortex + exporters: + - rules: + - name: Cortex ruler configuration reload failure + description: Cortex ruler configuration reload failure (instance {{ $labels.instance }}) + query: cortex_ruler_config_last_reload_successful != 1 + severity: warning + - name: Cortex not connected to Alertmanager + description: Cortex not connected to Alertmanager (instance {{ $labels.instance }}) + query: cortex_prometheus_notifications_alertmanagers_discovered < 1 + severity: critical + - name: Cortex notification are being dropped + description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_dropped_total[5m]) > 0 + severity: critical + - 
name: Cortex notification error + description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }}) + query: rate(cortex_prometheus_notifications_errors_total[5m]) > 0 + severity: critical