From 718a039313ca2283c782d8d5c18b306fb03d572d Mon Sep 17 00:00:00 2001
From: Samuel Berthe
Date: Sun, 8 Mar 2020 15:08:11 +0100
Subject: [PATCH] Adding an alert for prometheus internals: rule evaluation slowing down

---
Review notes (this section is ignored by `git am`):
- The new alert must fire when the last evaluation duration EXCEEDS the
  group interval, so the comparison operator is `>`; with `<` it would fire
  for every rule group that is evaluating on time.
- Line breaks and YAML indentation in the hunk below were reconstructed from
  a whitespace-collapsed copy of this patch; verify the indentation against
  _data/rules.yml before applying.

 _data/rules.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 8dfa6f7..5368371 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -19,13 +19,17 @@ services:
           query: "up == 0"
           severity: error
         - name: Prometheus rule evaluation failures
-          description: 'Prometheus encountered {{ $value }} rule evaluation failures. leading to potentially ignored alerts.'
+          description: 'Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.'
           query: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
           severity: error
         - name: Prometheus template text expansion failures
           description: 'Prometheus encountered {{ $value }} template text expansion failures'
           query: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
           severity: error
+        - name: Prometheus rule evaluation slow
+          description: 'Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.'
+          query: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
+          severity: error
         - name: Prometheus TSDB checkpoint creation failures
           description: 'Prometheus encountered {{ $value }} checkpoint creation failures'
           query: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[3m]) > 0'