From 58be245e660fb13434d1b68db1d0414448847ddb Mon Sep 17 00:00:00 2001
From: Marco Pernigo
Date: Thu, 13 May 2021 09:39:25 +0200
Subject: [PATCH] Loki alerts

Add a "Loki" entry to _data/rules.yml with five alert rules:

  * LokiProcTooManyRestarts: process_start_time_seconds for
    job=~"loki" changed more than twice within 15m (warning).
  * CortexRulerConfigurationReloadFailure:
    cortex_ruler_config_last_reload_successful is not 1 (warning).
  * CortexNotConnectedToAlertmanager: fewer than one discovered
    alertmanager (critical).
  * CortexNotificationAreBeingDropped: non-zero 5m rate of
    cortex_prometheus_notifications_dropped_total (critical).
  * CortexNotificationError: non-zero 5m rate of
    cortex_prometheus_notifications_errors_total (critical).

All rules use "for: 0m" and interpolate {{ $labels.instance }} into
their description.
---
Review notes (this section is ignored by git am):

  * NOTE(review): the YAML indentation in the hunk below was
    reconstructed; confirm that the context lines match
    _data/rules.yml at line 2001 before applying.
  * NOTE(review): the Cortex* rules query cortex_* metrics under the
    Loki entry; presumably Loki exposes them. Confirm.
  * NOTE(review): the "index" line still carries the original
    post-image blob id; regenerate the patch if an exact id is needed.

 _data/rules.yml | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/_data/rules.yml b/_data/rules.yml
index d38fec3..fb2408d 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2001,3 +2001,36 @@ groups:
                 description: Thanos compaction has not run in 24 hours.
                 query: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60'
                 severity: critical
+      - name: Loki
+        exporters:
+          - rules:
+              - name: LokiProcTooManyRestarts
+                query: 'changes(process_start_time_seconds{job=~"loki"}[15m]) > 2'
+                for: 0m
+                severity: warning
+                annotations:
+                  description: A loki process had too many restarts (target {{ $labels.instance }})
+              - name: CortexRulerConfigurationReloadFailure
+                query: 'cortex_ruler_config_last_reload_successful != 1'
+                for: 0m
+                severity: warning
+                annotations:
+                  description: Cortex ruler configuration reload failure (instance {{ $labels.instance }})
+              - name: CortexNotConnectedToAlertmanager
+                query: 'cortex_prometheus_notifications_alertmanagers_discovered < 1'
+                for: 0m
+                severity: critical
+                annotations:
+                  description: Cortex not connected to alertmanager (instance {{ $labels.instance }})
+              - name: CortexNotificationAreBeingDropped
+                query: 'rate(cortex_prometheus_notifications_dropped_total[5m]) > 0'
+                for: 0m
+                severity: critical
+                annotations:
+                  description: Cortex notifications are being dropped due to errors (instance {{ $labels.instance }})
+              - name: CortexNotificationError
+                query: 'rate(cortex_prometheus_notifications_errors_total[5m]) > 0'
+                for: 0m
+                severity: critical
+                annotations:
+                  description: Cortex is failing when sending alert notifications (instance {{ $labels.instance }})