From 91023e6ec4a04748dc0dc3ba044a4d5f0f467025 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 31 Dec 2020 00:26:13 +0100 Subject: [PATCH] doc: improve alertmanager.html page for debugging notification delays --- alertmanager.md | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/alertmanager.md b/alertmanager.md index 7e7375c..c7f8add 100644 --- a/alertmanager.md +++ b/alertmanager.md @@ -1,12 +1,20 @@ -

- Prometheus configuration -

+

+ Global configuration +

+ +If you notice a delay between an event and the first notification, read the following blog post => [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html). + +## Prometheus configuration {% highlight yaml %} # prometheus.yml global: - scrape_interval: 15s + scrape_interval: 20s + + # A short evaluation_interval will check alerting rules very often. + # It can be costly if you run Prometheus with 100+ alerts. + evaluation_interval: 20s ... rule_files: @@ -35,9 +43,7 @@ groups: {% endhighlight %} -

- AlertManager configuration -

+## AlertManager configuration {% highlight yaml %} {% raw %} @@ -53,7 +59,7 @@ route: # When the first notification was sent, wait 'group_interval' to send a batch # of new alerts that started firing for that group. - group_interval: 5m + group_interval: 30s # If an alert has successfully been sent, wait 'repeat_interval' to # resend them. @@ -92,3 +98,14 @@ receivers: {% endraw %} {% endhighlight %} + +## Troubleshooting + +If a notification takes too long to be triggered, check the following delays: +- `scrape_interval = 20s` (prometheus.yml) +- `evaluation_interval = 20s` (prometheus.yml) +- `increase(mysql_global_status_slow_queries[1m]) > 0` (alerts/example-mysql.yml) +- `for: 5m` (alerts/example-mysql.yml) +- `group_wait = 10s` (alertmanager.yml) + +Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).