From cd5b39a1f02123cb69476b868fee22f393f3f346 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sat, 5 Oct 2024 18:06:22 +0200 Subject: [PATCH 01/20] Create FUNDING.json --- FUNDING.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 FUNDING.json diff --git a/FUNDING.json b/FUNDING.json new file mode 100644 index 0000000..cf7bea6 --- /dev/null +++ b/FUNDING.json @@ -0,0 +1,7 @@ +{ + "drips": { + "ethereum": { + "ownedBy": "0xc31e1c24253da5a0c7ed4955347588c626c22292" + } + } +} From 640f06588d86e9f0818e3d8e90fed90a4b5cd6e2 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sat, 5 Oct 2024 18:21:35 +0200 Subject: [PATCH 02/20] Delete FUNDING.json --- FUNDING.json | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 FUNDING.json diff --git a/FUNDING.json b/FUNDING.json deleted file mode 100644 index cf7bea6..0000000 --- a/FUNDING.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "drips": { - "ethereum": { - "ownedBy": "0xc31e1c24253da5a0c7ed4955347588c626c22292" - } - } -} From 7313acce364a5ce013cc020d71fdc5178b7729be Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sat, 5 Oct 2024 18:57:43 +0200 Subject: [PATCH 03/20] Create FUNDING.json --- FUNDING.json | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 FUNDING.json diff --git a/FUNDING.json b/FUNDING.json new file mode 100644 index 0000000..c4eccbf --- /dev/null +++ b/FUNDING.json @@ -0,0 +1,7 @@ +{ + "drips": { + "ethereum": { + "ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1" + } + } +} From c41fda1d924f66315773df90b9e12db172169e81 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 6 Oct 2024 17:31:23 +0200 Subject: [PATCH 04/20] Update alertmanager.md --- alertmanager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/alertmanager.md b/alertmanager.md index 6b07089..d350945 100644 --- a/alertmanager.md +++ b/alertmanager.md @@ -138,4 +138,4 @@ If the notification takes too much time to be triggered, check the following del Also read: - [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html). - [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/) - +- [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/) From f9e683896f07e44ffd3ea15ba3b290a932c30a35 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 28 Oct 2024 20:17:58 +0100 Subject: [PATCH 05/20] build(deps-dev): bump rexml from 3.3.7 to 3.3.9 (#438) Bumps [rexml](https://github.com/ruby/rexml) from 3.3.7 to 3.3.9. - [Release notes](https://github.com/ruby/rexml/releases) - [Changelog](https://github.com/ruby/rexml/blob/master/NEWS.md) - [Commits](https://github.com/ruby/rexml/compare/v3.3.7...v3.3.9) --- updated-dependencies: - dependency-name: rexml dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 1afa3ed..305a897 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -248,7 +248,7 @@ GEM rb-fsevent (0.11.2) rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.3.7) + rexml (3.3.9) rouge (3.30.0) rubyzip (2.3.2) safe_yaml (1.0.5) From bb75cb2c68e0baa634df262b93129b9731a14c94 Mon Sep 17 00:00:00 2001 From: sipr-invivo <160140834+sipr-invivo@users.noreply.github.com> Date: Mon, 28 Oct 2024 22:24:10 +0100 Subject: [PATCH 06/20] feat: Add rule to Kubernetes Job not starting (#436) --- _data/rules.yml | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index 9b94c17..f05d289 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1132,12 +1132,12 @@ groups: description: "The indexing latency on Elasticsearch cluster is higher than the threshold." query: "elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005" severity: warning - for: 10m + for: 10m - name: Elasticsearch High Indexing Rate description: "The indexing rate on Elasticsearch cluster is higher than the threshold." query: "sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000" severity: warning - for: 5m + for: 5m - name: Elasticsearch High Query Rate description: "The query rate on Elasticsearch cluster is higher than the threshold." query: "sum(rate(elasticsearch_indices_search_query_total[1m])) > 100" @@ -1147,14 +1147,14 @@ groups: description: "The query latency on Elasticsearch cluster is higher than the threshold." query: "elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1" severity: warning - for: 5m + for: 5m - name: Meilisearch exporters: - name: Embedded exporter slug: embedded-exporter doc_url: https://github.com/orgs/meilisearch/discussions/625 - rules: + rules: - name: Meilisearch index is empty description: Meilisearch instance is down query: 'meilisearch_index_docs_count == 0' @@ -2044,6 +2044,11 @@ groups: description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete" query: "kube_job_status_failed > 0" severity: warning + - name: Kubernetes Job not starting + summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }}) + description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes" + query: "kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600" + severity: warning - name: Kubernetes CronJob suspended summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended" From 14949721ba376f9c1c668bbe10eaeef6f86b0671 Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 28 Oct 2024 21:25:18 +0000 Subject: [PATCH 07/20] Publish --- dist/rules/kubernetes/kubestate-exporter.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index 2db1d64..efb914f 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -67,6 +67,15 @@ groups: summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }}) description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: KubernetesJobNotStarting + expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600' + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }}) + description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: KubernetesCronjobSuspended expr: 'kube_cronjob_spec_suspend != 0' for: 0m From 353ef1ed95ef03a93087feb2162356a392f53cc6 Mon Sep 17 00:00:00 2001 From: Martin Anderson Date: Sat, 30 Nov 2024 11:29:57 +0200 Subject: [PATCH 08/20] RabbitMQ: add too many ready messages alert (#441) * RabbitMQ: add too many ready messages alert * Add RabbitMQ ready messages alert rule --------- Co-authored-by: Samuel Berthe --- _data/rules.yml | 5 +++++ dist/rules/rabbitmq/rabbitmq-exporter.yml | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index f05d289..aa0ed38 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -969,6 +969,11 @@ groups: query: "rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90" severity: warning for: 2m + - name: RabbitMQ too many ready messages + description: RabbitMQ too many ready messages on {{ $labels.instace }} + query: "sum(rabbitmq_queue_messages_ready) BY (queue) > 1000" + severity: warning + for: 1m - name: RabbitMQ too many unack messages description: Too many unacknowledged messages query: "sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000" diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index be95359..c699128 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -49,6 +49,15 @@ groups: summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }}) description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: RabbitmqTooManyReadyMessages + expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000' + for: 1m + labels: + severity: warning + annotations: + summary: RabbitMQ too many ready messages (instance {{ $labels.instance }}) + description: "Too many ready messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: RabbitmqTooManyUnackMessages expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' for: 1m From 8a220b1b8af48f5405e2bbc28de14d93128efca7 Mon Sep 17 00:00:00 2001 From: samber Date: Sat, 30 Nov 2024 09:31:05 +0000 Subject: [PATCH 09/20] Publish --- dist/rules/rabbitmq/rabbitmq-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/rabbitmq/rabbitmq-exporter.yml b/dist/rules/rabbitmq/rabbitmq-exporter.yml index c699128..10823d2 100644 --- a/dist/rules/rabbitmq/rabbitmq-exporter.yml +++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml @@ -56,7 +56,7 @@ groups: severity: warning annotations: summary: RabbitMQ too many ready messages (instance {{ $labels.instance }}) - description: "Too many ready messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "RabbitMQ too many ready messages on {{ $labels.instace }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqTooManyUnackMessages expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' From 8c3d06502fb26e7d0135a65d55130afa8e905ea9 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Thu, 5 Dec 2024 23:37:28 +0100 Subject: [PATCH 10/20] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index aa0ed38..5a274cc 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2036,7 +2036,7 @@ groups: for: 2m - name: Kubernetes Node out of pod capacity description: "Node {{ $labels.node }} is out of pod capacity" - query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' + query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' severity: warning for: 2m - name: Kubernetes Container oom killer From 4e38ae2087b03672e2c63439341ffcedb63ce90e Mon Sep 17 00:00:00 2001 From: samber Date: Thu, 5 Dec 2024 22:38:38 +0000 Subject: [PATCH 11/20] Publish --- dist/rules/kubernetes/kubestate-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/kubernetes/kubestate-exporter.yml b/dist/rules/kubernetes/kubestate-exporter.yml index efb914f..7e32694 100644 --- a/dist/rules/kubernetes/kubestate-exporter.yml +++ b/dist/rules/kubernetes/kubestate-exporter.yml @@ -41,7 +41,7 @@ groups: description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesNodeOutOfPodCapacity - expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' + expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' for: 2m labels: severity: warning From fff8a80ae5b19109bcdb6331e5304cf21fbba269 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 8 Dec 2024 21:24:45 +0100 Subject: [PATCH 12/20] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 9188322..fdb5643 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki) - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) +- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) From a8d7c43b3052e08b8a365f831bc74f9e87f0a824 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Sun, 8 Dec 2024 21:28:07 +0100 Subject: [PATCH 13/20] Update rules.yml --- _data/rules.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index 5a274cc..abf9beb 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -2982,6 +2982,15 @@ groups: severity: critical for: 5m + - name: Grafana Alloy + exporters: + - slug: embedded-exporter + rules: + - name: Grafana Alloy service down + description: Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running. + query: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) ' + severity: critical + - name: Jenkins exporters: - name: Metric plugin From c5203e94d009b50d4bb616ba7a6775a2bb9833a7 Mon Sep 17 00:00:00 2001 From: samber Date: Sun, 8 Dec 2024 20:29:15 +0000 Subject: [PATCH 14/20] Publish --- dist/rules/grafana-alloy/embedded-exporter.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 dist/rules/grafana-alloy/embedded-exporter.yml diff --git a/dist/rules/grafana-alloy/embedded-exporter.yml b/dist/rules/grafana-alloy/embedded-exporter.yml new file mode 100644 index 0000000..d86c8a4 --- /dev/null +++ b/dist/rules/grafana-alloy/embedded-exporter.yml @@ -0,0 +1,14 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: GrafanaAlloyServiceDown + expr: 'count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) ' + for: 0m + labels: + severity: critical + annotations: + summary: Grafana Alloy service down (instance {{ $labels.instance }}) + description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" From 52d4a8c7449edfce74e7d495ecb9a070c58f9d31 Mon Sep 17 00:00:00 2001 From: dxrayz <105016816+dxrayz@users.noreply.github.com> Date: Mon, 16 Dec 2024 13:16:05 +0200 Subject: [PATCH 15/20] Update postgres-exporter.yml (#444) Modify PostgresqlConfigurationChanged for prevent error: "many-to-many matching not allowed: matching labels must be unique on one side" in cases when you have multiple instances of postgres --- dist/rules/postgresql/postgres-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 2ab461f..36070b3 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -140,7 +140,7 @@ groups: description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlConfigurationChanged - expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + expr: '{__name__=~"pg_settings_.*"} != ON(__name__,instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' for: 0m labels: severity: info From 4533f23b79bc3cb8b608eddec3001ac949408aee Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 16 Dec 2024 11:17:17 +0000 Subject: [PATCH 16/20] Publish --- dist/rules/postgresql/postgres-exporter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 36070b3..2ab461f 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -140,7 +140,7 @@ groups: description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlConfigurationChanged - expr: '{__name__=~"pg_settings_.*"} != ON(__name__,instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' for: 0m labels: severity: info From 84a3b517a8338407b0c6dc7f7890f4aaa4580901 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 16 Dec 2024 12:17:26 +0100 Subject: [PATCH 17/20] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index abf9beb..6cb0fc8 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -703,7 +703,7 @@ groups: for: 2m - name: Postgresql configuration changed description: Postgres Database configuration change has occurred - query: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + query: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' severity: info - name: Postgresql SSL compression active description: Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`. From bdcc67c04e204d8abd88efe41085a7f68ea0ee39 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 16 Dec 2024 12:17:59 +0100 Subject: [PATCH 18/20] Update rules.yml --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index 6cb0fc8..fa57247 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -545,7 +545,7 @@ groups: for: 5m - name: Netdata high memory usage description: Netdata high memory usage (> 80%) - query: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20' + query: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20' severity: warning for: 5m - name: Netdata low disk space From 53a369769d98837632cfc515d7caf53f04ebcd6b Mon Sep 17 00:00:00 2001 From: samber Date: Mon, 16 Dec 2024 11:19:08 +0000 Subject: [PATCH 19/20] Publish --- dist/rules/netdata/embedded-exporter.yml | 2 +- dist/rules/postgresql/postgres-exporter.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dist/rules/netdata/embedded-exporter.yml b/dist/rules/netdata/embedded-exporter.yml index 7d21766..8c57745 100644 --- a/dist/rules/netdata/embedded-exporter.yml +++ b/dist/rules/netdata/embedded-exporter.yml @@ -23,7 +23,7 @@ groups: description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: NetdataHighMemoryUsage - expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20' + expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20' for: 5m labels: severity: warning diff --git a/dist/rules/postgresql/postgres-exporter.yml b/dist/rules/postgresql/postgres-exporter.yml index 2ab461f..96ae5ea 100644 --- a/dist/rules/postgresql/postgres-exporter.yml +++ b/dist/rules/postgresql/postgres-exporter.yml @@ -140,7 +140,7 @@ groups: description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PostgresqlConfigurationChanged - expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' for: 0m labels: severity: info From cbb2337438e8ff75f67e5bde59b8872b3fe31f6a Mon Sep 17 00:00:00 2001 From: sunlei Date: Mon, 13 Jan 2025 05:01:21 +0800 Subject: [PATCH 20/20] fix: formatting errors (#448) * fix: formatting errors * Update query format in rules.yml --------- Co-authored-by: Samuel Berthe --- _data/rules.yml | 5 +---- dist/rules/host-and-hardware/node-exporter.yml | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index fa57247..eab61c2 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -242,10 +242,7 @@ groups: for: 5m - name: Host context switching high description: Context switching is growing on the node (twice the daily average during the last 15m) - query: | - (rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) - / - (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 + query: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' severity: warning comments: | x2 context switches is an arbitrary number. diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml index 6a465d9..a6adff1 100644 --- a/dist/rules/host-and-hardware/node-exporter.yml +++ b/dist/rules/host-and-hardware/node-exporter.yml @@ -176,10 +176,7 @@ groups: description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostContextSwitchingHigh - expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) -/ -(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 -' + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))/(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' for: 0m labels: severity: warning