From 55b049eb2817541f0b6b989b3a31c1b6b5149332 Mon Sep 17 00:00:00 2001 From: Yonah Dissen <47282577+yonahd@users.noreply.github.com> Date: Sun, 2 Oct 2022 19:05:30 +0300 Subject: [PATCH] add argocd rules (#309) * add argocd rules * fix(argocd): move contrib into _data/rules.yml instead of dist/... Co-authored-by: Samuel Berthe --- README.md | 1 + _data/rules.yml | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b567c4..c914de9 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/ - [Etcd](https://awesome-prometheus-alerts.grep.to/rules#etcd) - [Linkerd](https://awesome-prometheus-alerts.grep.to/rules#linkerd) - [Istio](https://awesome-prometheus-alerts.grep.to/rules#istio) +- [ArgoCD](https://awesome-prometheus-alerts.grep.to/rules#argocd) #### Network, security and storage diff --git a/_data/rules.yml b/_data/rules.yml index c72ef75..286a567 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -27,7 +27,7 @@ groups: query: 'sum by (job) (up) == 0' severity: critical - name: Prometheus target missing with warmup time - description: Allow a job time to start up (10 minutes) before alerting that it's down. + description: Allow a job time to start up (10 minutes) before alerting that it's down. query: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' severity: critical - name: Prometheus configuration reload failure @@ -1952,6 +1952,24 @@ groups: query: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0' severity: critical + - name: ArgoCD + exporters: + - name: Embedded exporter + slug: embedded-exporter + doc_url: https://argo-cd.readthedocs.io/en/stable/operator-manual/metrics/ + rules: + - name: ArgoCD service not synced + description: Service {{ $labels.name }} run by argo is currently not in sync. + query: 'argocd_app_info{sync_status!="Synced"} != 0' + severity: warning + for: 15m + - name: ArgoCD service unhealthy + description: Service {{ $labels.name }} run by argo is currently not healthy. + query: 'argocd_app_info{health_status!="Healthy"} != 0' + severity: warning + for: 15m + + - name: Network, security and storage services: - name: Ceph