From 36ca52e59887922163169189c1c24e1afe06a79e Mon Sep 17 00:00:00 2001
From: Andre Martins <58525821+apmartins85@users.noreply.github.com>
Date: Sun, 3 Oct 2021 17:12:59 -0300
Subject: [PATCH] adding alerts to promtail and loki (#241)

Co-authored-by: apmbktf
Co-authored-by: Samuel Berthe
---
Review notes (free text in this position is not part of the diff):
  * Adds a Promtail entry to the README index, three Loki rules
    (request errors, panics, p99 latency) and a new Promtail service
    with two rules (request errors, p99 latency).
  * Both latency queries keep namespace/job/route (plus instance for
    Promtail) next to `le` when summing buckets, because the
    descriptions print {{ $labels.job }} and {{ $labels.route }};
    summing by `le` alone would leave those templates empty and merge
    every route into a single quantile.
  * The panic rule's value is the increase of loki_panic_total over
    10 minutes (a count), so its description does not use a percent sign.
  * NOTE(review): YAML indentation below was reconstructed from the
    list/mapping nesting; confirm it against _data/rules.yml before applying.

 README.md       |  1 +
 _data/rules.yml | 28 ++++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/README.md b/README.md
index a3a11bf..29ebfe5 100644
--- a/README.md
+++ b/README.md
@@ -74,6 +74,7 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
 
 - [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos)
 - [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
+- [Promtail](https://awesome-prometheus-alerts.grep.to/rules#promtail)
 - [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)
 
 ## 🤝 Contributing
diff --git a/_data/rules.yml b/_data/rules.yml
index b933c0a..a28847c 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -2031,6 +2031,34 @@ groups:
                 description: A loki process had too many restarts (target {{ $labels.instance }})
                 query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
                 severity: warning
+              - name: Loki request errors
+                description: The {{ $labels.job }} and {{ $labels.route }} are experiencing errors
+                query: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10'
+                severity: critical
+                for: 15m
+              - name: Loki request panic
+                description: The {{ $labels.job }} is experiencing an increase of {{ printf "%.2f" $value }} panics over the last 10 minutes
+                query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
+                severity: critical
+                for: 5m
+              - name: Loki request latency
+                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency
+                query: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le, namespace, job, route))) > 1
+                severity: critical
+                for: 5m
+      - name: Promtail
+        exporters:
+          - rules:
+              - name: Promtail request errors
+                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
+                query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
+                severity: critical
+                for: 5m
+              - name: Promtail request latency
+                description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
+                query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le, namespace, job, route, instance)) > 1
+                severity: critical
+                for: 5m
       - name: Cortex
         exporters:
           - rules: