From 0117a6eef4414f66247595484a575baf288ba7b9 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 11 Feb 2019 22:09:50 +0100 Subject: [PATCH 1/5] :lipstick: awesome-lint --- .travis.yml | 3 +++ README.md | 17 +++++++++++------ package.json | 8 ++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 .travis.yml create mode 100644 package.json diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f178ec0 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,3 @@ +language: node_js +node_js: + - 'node' diff --git a/README.md b/README.md index e32f11a..0ee1177 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ -# Awesome Prometheus alerting rules [![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg)](https://github.com/sindresorhus/awesome) +# Awesome Prometheus alerting rules [![Awesome](https://awesome.re/badge-flat.svg)](https://awesome.re) -

+

@@ -9,7 +9,12 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https://awesome-prometheus-alerts.grep.to)** -## Content +## Contents + +- [Rules](#rules) +- [Improvements](#improvements) + +## Rules - [Prometheus](https://awesome-prometheus-alerts.grep.to/rules#prometheus) - [Host](https://awesome-prometheus-alerts.grep.to/rules#host) @@ -41,11 +46,11 @@ Contributions for common alerting rules are most welcome! [Instructions here](CONTRIBUTING.md) -## Todo +## Improvements -- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances, ...) +- Create an alert rule builder in Jekyll for custom alerts (severity, thresholds, instances...) -# License +## License [![CC4](https://mirrors.creativecommons.org/presskit/cc.srr.primary.svg)](https://creativecommons.org/licenses/by/4.0/legalcode) diff --git a/package.json b/package.json new file mode 100644 index 0000000..1f697f1 --- /dev/null +++ b/package.json @@ -0,0 +1,8 @@ +{ + "scripts": { + "test": "awesome-lint" + }, + "devDependencies": { + "awesome-lint": "*" + } +} From d889a9594f31781d3df10a37c5f23459e6251045 Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Thu, 14 Feb 2019 22:36:35 +0300 Subject: [PATCH 2/5] LA (2 task per core) --- _data/rules.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index efd79bc..ebaa31a 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -49,8 +49,8 @@ services: query: 'rate(node_disk_write_time_ms[1m]) / rate(node_disk_writes_completed[1m]) > 100' severity: warning - name: CPU load - description: CPU load (15m) is high (> 75%) - query: 'avg by (instance) (sum by (cpu) (rate(node_cpu{mode!="idle"}[5m]))) * 100 > 75' + description: CPU load (15m) is high + query: 'node_load15 / (count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 2' severity: warning - name: Context switching description: Context switching is growing on node (> 1000 / s) From 
51eedcf6165dcc3d2aa3f7e827efeb5cdd5dde5d Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Thu, 14 Feb 2019 22:37:04 +0300 Subject: [PATCH 3/5] fix memory metric name --- _data/rules.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_data/rules.yml b/_data/rules.yml index ebaa31a..14f731f 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -14,7 +14,7 @@ services: rules: - name: Out of memory description: Node memory is filling up (< 10% left) - query: '(node_memory_MemFree + node_memory_Cached + node_memory_Buffers) / node_memory_MemTotal * 100 < 10' + query: '(node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 < 10' severity: warning - name: Unusual network throughput in description: Host network interfaces are probably receiving too much data (> 100 MB/s) From ff7ef5f6bdb616445db1b11c6329f2561278f08d Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Thu, 14 Feb 2019 22:37:22 +0300 Subject: [PATCH 4/5] node has swap alert --- _data/rules.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index 14f731f..cbae4c4 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -56,6 +56,10 @@ services: description: Context switching is growing on node (> 1000 / s) query: 'rate(node_context_switches[5m]) > 1000' severity: warning + - name: Node has swap + description: Node has swap + query: 'node_memory_SwapTotal_bytes > 0' + severity: warning - name: Docker containers exporters: From 8136b239be6abc5cb023e8cc74340bdc116a7c61 Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Thu, 14 Feb 2019 22:52:41 +0300 Subject: [PATCH 5/5] add _bytes && _total for metrics --- _data/rules.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index cbae4c4..9b2558c 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -18,23 +18,23 @@ services: severity: warning - name: Unusual network throughput 
in description: Host network interfaces are probably receiving too much data (> 100 MB/s) - query: 'sum by (instance) (irate(node_network_receive_bytes[2m])) / 1024 / 1024 > 100' + query: 'sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100' severity: warning - name: Unusual network throughput out description: Host network interfaces are probably sending too much data (> 100 MB/s) - query: 'sum by (instance) (irate(node_network_transmit_bytes[2m])) / 1024 / 1024 > 100' + query: 'sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100' severity: warning - name: Unusual disk read rate description: Disk is probably reading too much data (> 50 MB/s) - query: 'sum by (instance) (irate(node_disk_bytes_read[2m])) / 1024 / 1024 > 50' + query: 'sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50' severity: warning - name: Unusual disk write rate description: Disk is probably writing too much data (> 50 MB/s) - query: 'sum by (instance) (irate(node_disk_bytes_written[2m])) / 1024 / 1024 > 50' + query: 'sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50' severity: warning - name: Out of disk space description: Disk is almost full (< 10% left) - query: 'node_filesystem_free{mountpoint ="/rootfs"} / node_filesystem_size{mountpoint ="/rootfs"} * 100 < 10' + query: 'node_filesystem_free_bytes{mountpoint ="/rootfs"} / node_filesystem_size_bytes{mountpoint ="/rootfs"} * 100 < 10' severity: warning - name: Out of inodes description: Disk is almost running out of available inodes (< 10% left) @@ -42,11 +42,11 @@ services: severity: warning - name: Unusual disk read latency description: Disk latency is growing (read operations > 100ms) - query: 'rate(node_disk_read_time_ms[1m]) / rate(node_disk_reads_completed[1m]) > 100' + query: 'rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1' severity: warning - name: Unusual disk write 
latency description: Disk latency is growing (write operations > 100ms) - query: 'rate(node_disk_write_time_ms[1m]) / rate(node_disk_writes_completed[1m]) > 100' + query: 'rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1' severity: warning - name: CPU load description: CPU load (15m) is high @@ -54,7 +54,7 @@ services: severity: warning - name: Context switching description: Context switching is growing on node (> 1000 / s) - query: 'rate(node_context_switches[5m]) > 1000' + query: 'rate(node_context_switches_total[5m]) > 1000' severity: warning - name: Node has swap description: Node has swap