mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-22 09:27:26 +08:00
adding alerts to promtail and loki (#241)
Co-authored-by: apmbktf <andre.pasqualinoto-martins@itau-unibanco.com.br> Co-authored-by: Samuel Berthe <dev@samuel-berthe.fr>
This commit is contained in:
parent
dc85963ae5
commit
36ca52e598
2 changed files with 29 additions and 0 deletions
|
|
@ -74,6 +74,7 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/
|
|||
|
||||
- [Thanos](https://awesome-prometheus-alerts.grep.to/rules#thanos)
|
||||
- [Loki](https://awesome-prometheus-alerts.grep.to/rules#loki)
|
||||
- [Promtail](https://awesome-prometheus-alerts.grep.to/rules#promtail)
|
||||
- [Cortex](https://awesome-prometheus-alerts.grep.to/rules#cortex)
|
||||
|
||||
## 🤝 Contributing
|
||||
|
|
|
|||
|
|
@ -2031,6 +2031,34 @@ groups:
|
|||
description: A loki process had too many restarts (target {{ $labels.instance }})
|
||||
query: changes(process_start_time_seconds{job=~"loki"}[15m]) > 2
|
||||
severity: warning
|
||||
# Fires when more than 10% of Loki requests, grouped by namespace/job/route,
# return a 5xx status code (1m rate window), sustained for 15m.
- name: Loki request errors
  description: 'The {{ $labels.job }} and {{ $labels.route }} are experiencing errors'
  query: >-
    100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
    / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
    > 10
  severity: critical
  for: 15m
|
||||
# Fires when loki_panic_total has increased over the last 10m for a
# namespace/job pair, sustained for 5m.
- name: Loki request panic
  # $value is the absolute increase of the panic counter over the 10m window,
  # not a percentage, so it is reported as a count.
  description: The {{ $labels.job }} is experiencing {{ printf "%.2f" $value }} panics over the last 10 minutes
  query: sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
  severity: critical
  for: 5m
|
||||
# Fires when the 99th percentile of loki_request_duration_seconds (5m rate
# window) exceeds 1s for 5m. Routes whose name matches "tail"
# (case-insensitive) are excluded.
- name: Loki request latency
  description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency
  # Aggregate by namespace/job/route in addition to le: grouping by le alone
  # drops the job and route labels that the description template references.
  query: (histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (namespace, job, route, le))) > 1
  severity: critical
  for: 5m
|
||||
# Promtail alert group: request error ratio and request latency.
- name: Promtail
  exporters:
    - rules:
        # Fires when more than 10% of Promtail requests per
        # namespace/job/route/instance have a 5xx or "failed" status code
        # (1m rate window), sustained for 5m.
        - name: Promtail request errors
          description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
          query: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10'
          severity: critical
          for: 5m
        # Fires when the 99th percentile of promtail_request_duration_seconds
        # (5m rate window) exceeds 1s for 5m.
        - name: Promtail request latency
          description: The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
          # Keep the same grouping labels as the error rule (plus le):
          # grouping by le alone drops the job and route labels that the
          # description template references.
          query: histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (namespace, job, route, instance, le)) > 1
          severity: critical
          for: 5m
|
||||
- name: Cortex
|
||||
exporters:
|
||||
- rules:
|
||||
|
|
|
|||
Loading…
Reference in a new issue