mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Jekyll based doc
This commit is contained in:
parent
1c28ca9eb7
commit
0bc4a1633c
11 changed files with 804 additions and 108 deletions
14
CONTRIBUTING.md
Normal file
14
CONTRIBUTING.md
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
|
||||
# Contributing
|
||||
|
||||
## Adding alerting rule
|
||||
|
||||
Rules are here: `_data/rules.yml`.
|
||||
|
||||
## Run locally
|
||||
|
||||
```
|
||||
gem install bundler
|
||||
bundle install
|
||||
bundle exec jekyll serve
|
||||
```
|
||||
2
Gemfile
Normal file
2
Gemfile
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
source 'https://rubygems.org'
|
||||
gem 'github-pages', group: :jekyll_plugins
|
||||
248
Gemfile.lock
Normal file
248
Gemfile.lock
Normal file
|
|
@ -0,0 +1,248 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
activesupport (4.2.10)
|
||||
i18n (~> 0.7)
|
||||
minitest (~> 5.1)
|
||||
thread_safe (~> 0.3, >= 0.3.4)
|
||||
tzinfo (~> 1.1)
|
||||
addressable (2.5.2)
|
||||
public_suffix (>= 2.0.2, < 4.0)
|
||||
coffee-script (2.4.1)
|
||||
coffee-script-source
|
||||
execjs
|
||||
coffee-script-source (1.11.1)
|
||||
colorator (1.1.0)
|
||||
commonmarker (0.17.13)
|
||||
ruby-enum (~> 0.5)
|
||||
concurrent-ruby (1.0.5)
|
||||
dnsruby (1.61.2)
|
||||
addressable (~> 2.5)
|
||||
em-websocket (0.5.1)
|
||||
eventmachine (>= 0.12.9)
|
||||
http_parser.rb (~> 0.6.0)
|
||||
ethon (0.11.0)
|
||||
ffi (>= 1.3.0)
|
||||
eventmachine (1.2.7)
|
||||
execjs (2.7.0)
|
||||
faraday (0.15.3)
|
||||
multipart-post (>= 1.2, < 3)
|
||||
ffi (1.9.25)
|
||||
forwardable-extended (2.6.0)
|
||||
gemoji (3.0.0)
|
||||
github-pages (192)
|
||||
activesupport (= 4.2.10)
|
||||
github-pages-health-check (= 1.8.1)
|
||||
jekyll (= 3.7.4)
|
||||
jekyll-avatar (= 0.6.0)
|
||||
jekyll-coffeescript (= 1.1.1)
|
||||
jekyll-commonmark-ghpages (= 0.1.5)
|
||||
jekyll-default-layout (= 0.1.4)
|
||||
jekyll-feed (= 0.10.0)
|
||||
jekyll-gist (= 1.5.0)
|
||||
jekyll-github-metadata (= 2.9.4)
|
||||
jekyll-mentions (= 1.4.1)
|
||||
jekyll-optional-front-matter (= 0.3.0)
|
||||
jekyll-paginate (= 1.1.0)
|
||||
jekyll-readme-index (= 0.2.0)
|
||||
jekyll-redirect-from (= 0.14.0)
|
||||
jekyll-relative-links (= 0.5.3)
|
||||
jekyll-remote-theme (= 0.3.1)
|
||||
jekyll-sass-converter (= 1.5.2)
|
||||
jekyll-seo-tag (= 2.5.0)
|
||||
jekyll-sitemap (= 1.2.0)
|
||||
jekyll-swiss (= 0.4.0)
|
||||
jekyll-theme-architect (= 0.1.1)
|
||||
jekyll-theme-cayman (= 0.1.1)
|
||||
jekyll-theme-dinky (= 0.1.1)
|
||||
jekyll-theme-hacker (= 0.1.1)
|
||||
jekyll-theme-leap-day (= 0.1.1)
|
||||
jekyll-theme-merlot (= 0.1.1)
|
||||
jekyll-theme-midnight (= 0.1.1)
|
||||
jekyll-theme-minimal (= 0.1.1)
|
||||
jekyll-theme-modernist (= 0.1.1)
|
||||
jekyll-theme-primer (= 0.5.3)
|
||||
jekyll-theme-slate (= 0.1.1)
|
||||
jekyll-theme-tactile (= 0.1.1)
|
||||
jekyll-theme-time-machine (= 0.1.1)
|
||||
jekyll-titles-from-headings (= 0.5.1)
|
||||
jemoji (= 0.10.1)
|
||||
kramdown (= 1.17.0)
|
||||
liquid (= 4.0.0)
|
||||
listen (= 3.1.5)
|
||||
mercenary (~> 0.3)
|
||||
minima (= 2.5.0)
|
||||
nokogiri (>= 1.8.2, < 2.0)
|
||||
rouge (= 2.2.1)
|
||||
terminal-table (~> 1.4)
|
||||
github-pages-health-check (1.8.1)
|
||||
addressable (~> 2.3)
|
||||
dnsruby (~> 1.60)
|
||||
octokit (~> 4.0)
|
||||
public_suffix (~> 2.0)
|
||||
typhoeus (~> 1.3)
|
||||
html-pipeline (2.8.4)
|
||||
activesupport (>= 2)
|
||||
nokogiri (>= 1.4)
|
||||
http_parser.rb (0.6.0)
|
||||
i18n (0.9.5)
|
||||
concurrent-ruby (~> 1.0)
|
||||
jekyll (3.7.4)
|
||||
addressable (~> 2.4)
|
||||
colorator (~> 1.0)
|
||||
em-websocket (~> 0.5)
|
||||
i18n (~> 0.7)
|
||||
jekyll-sass-converter (~> 1.0)
|
||||
jekyll-watch (~> 2.0)
|
||||
kramdown (~> 1.14)
|
||||
liquid (~> 4.0)
|
||||
mercenary (~> 0.3.3)
|
||||
pathutil (~> 0.9)
|
||||
rouge (>= 1.7, < 4)
|
||||
safe_yaml (~> 1.0)
|
||||
jekyll-avatar (0.6.0)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-coffeescript (1.1.1)
|
||||
coffee-script (~> 2.2)
|
||||
coffee-script-source (~> 1.11.1)
|
||||
jekyll-commonmark (1.2.0)
|
||||
commonmarker (~> 0.14)
|
||||
jekyll (>= 3.0, < 4.0)
|
||||
jekyll-commonmark-ghpages (0.1.5)
|
||||
commonmarker (~> 0.17.6)
|
||||
jekyll-commonmark (~> 1)
|
||||
rouge (~> 2)
|
||||
jekyll-default-layout (0.1.4)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-feed (0.10.0)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-gist (1.5.0)
|
||||
octokit (~> 4.2)
|
||||
jekyll-github-metadata (2.9.4)
|
||||
jekyll (~> 3.1)
|
||||
octokit (~> 4.0, != 4.4.0)
|
||||
jekyll-mentions (1.4.1)
|
||||
html-pipeline (~> 2.3)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-optional-front-matter (0.3.0)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-paginate (1.1.0)
|
||||
jekyll-readme-index (0.2.0)
|
||||
jekyll (~> 3.0)
|
||||
jekyll-redirect-from (0.14.0)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-relative-links (0.5.3)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-remote-theme (0.3.1)
|
||||
jekyll (~> 3.5)
|
||||
rubyzip (>= 1.2.1, < 3.0)
|
||||
jekyll-sass-converter (1.5.2)
|
||||
sass (~> 3.4)
|
||||
jekyll-seo-tag (2.5.0)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-sitemap (1.2.0)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-swiss (0.4.0)
|
||||
jekyll-theme-architect (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-cayman (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-dinky (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-hacker (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-leap-day (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-merlot (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-midnight (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-minimal (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-modernist (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-primer (0.5.3)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-github-metadata (~> 2.9)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-slate (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-tactile (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-theme-time-machine (0.1.1)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-seo-tag (~> 2.0)
|
||||
jekyll-titles-from-headings (0.5.1)
|
||||
jekyll (~> 3.3)
|
||||
jekyll-watch (2.1.2)
|
||||
listen (~> 3.0)
|
||||
jemoji (0.10.1)
|
||||
gemoji (~> 3.0)
|
||||
html-pipeline (~> 2.2)
|
||||
jekyll (~> 3.0)
|
||||
kramdown (1.17.0)
|
||||
liquid (4.0.0)
|
||||
listen (3.1.5)
|
||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||
rb-inotify (~> 0.9, >= 0.9.7)
|
||||
ruby_dep (~> 1.2)
|
||||
mercenary (0.3.6)
|
||||
mini_portile2 (2.3.0)
|
||||
minima (2.5.0)
|
||||
jekyll (~> 3.5)
|
||||
jekyll-feed (~> 0.9)
|
||||
jekyll-seo-tag (~> 2.1)
|
||||
minitest (5.11.3)
|
||||
multipart-post (2.0.0)
|
||||
nokogiri (1.8.5)
|
||||
mini_portile2 (~> 2.3.0)
|
||||
octokit (4.13.0)
|
||||
sawyer (~> 0.8.0, >= 0.5.3)
|
||||
pathutil (0.16.1)
|
||||
forwardable-extended (~> 2.6)
|
||||
public_suffix (2.0.5)
|
||||
rb-fsevent (0.10.3)
|
||||
rb-inotify (0.9.10)
|
||||
ffi (>= 0.5.0, < 2)
|
||||
rouge (2.2.1)
|
||||
ruby-enum (0.7.2)
|
||||
i18n
|
||||
ruby_dep (1.5.0)
|
||||
rubyzip (1.2.2)
|
||||
safe_yaml (1.0.4)
|
||||
sass (3.6.0)
|
||||
sass-listen (~> 4.0.0)
|
||||
sass-listen (4.0.0)
|
||||
rb-fsevent (~> 0.9, >= 0.9.4)
|
||||
rb-inotify (~> 0.9, >= 0.9.7)
|
||||
sawyer (0.8.1)
|
||||
addressable (>= 2.3.5, < 2.6)
|
||||
faraday (~> 0.8, < 1.0)
|
||||
terminal-table (1.8.0)
|
||||
unicode-display_width (~> 1.1, >= 1.1.1)
|
||||
thread_safe (0.3.6)
|
||||
typhoeus (1.3.0)
|
||||
ethon (>= 0.9.0)
|
||||
tzinfo (1.2.5)
|
||||
thread_safe (~> 0.1)
|
||||
unicode-display_width (1.4.0)
|
||||
|
||||
PLATFORMS
|
||||
ruby
|
||||
|
||||
DEPENDENCIES
|
||||
github-pages
|
||||
|
||||
BUNDLED WITH
|
||||
1.16.6
|
||||
110
README.md
110
README.md
|
|
@ -2,112 +2,8 @@
|
|||
|
||||
(WIP)
|
||||
|
||||
[https://awesome-prometheus-alerts.grep.to](https://awesome-prometheus-alerts.grep.to)
|
||||
|
||||
## Todo
|
||||
|
||||
- Write full alert rules in yml files
|
||||
- Make a small website with a form for each rule, to build custom alerts (criticality, thresholds, instance...)
|
||||
|
||||
## Queries
|
||||
|
||||
### Prometheus internal
|
||||
|
||||
- `up == 0` // killed exporters
|
||||
|
||||
### node-exporter
|
||||
|
||||
Memory:
|
||||
|
||||
- `(node_memory_MemFree{} + node_memory_Cached{} + node_memory_Buffers{}) / node_memory_MemTotal{} * 100 < 5`
|
||||
|
||||
Network:
|
||||
|
||||
- `sum by (instance) (irate(node_network_transmit_bytes{}[2m])) / 1024 / 1024 > 100`
|
||||
- `sum by (instance) (irate(node_network_receive_bytes{}[2m])) / 1024 / 1024 > 100`
|
||||
|
||||
Disk:
|
||||
|
||||
- `sum by (instance) (irate(node_disk_bytes_read{}[2m])) / 1024 / 1024 > 50`
|
||||
- `sum by (instance) (irate(node_disk_bytes_written{}[2m])) / 1024 / 1024 > 50`
|
||||
- `node_filesystem_free{mountpoint ="/rootfs"} / node_filesystem_size{mountpoint ="/rootfs"} * 100 < 10` // gb
|
||||
- `node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100` // inodes
|
||||
- `rate(node_disk_read_time_ms{}[1m]) / rate(node_disk_reads_completed{}[1m]) > 100` // too much latency
|
||||
- `rate(node_disk_write_time_ms{}[1m]) / rate(node_disk_writes_completed{}[1m]) > 100` // too much latency
|
||||
|
||||
CPU:
|
||||
|
||||
- `avg by (instance) (sum by (cpu) (rate(node_cpu{mode!="idle"}[2m]))) * 100 > 75` // load
|
||||
- `rate(node_context_switches{}[5m]) > 1000` // nbr context switch per second
|
||||
|
||||
### cAdvisor
|
||||
|
||||
- `time() - container_last_seen{} > 60` // get killed container
|
||||
|
||||
### Nginx
|
||||
|
||||
- `rate(nginx_http_requests_total{status=~"^4.."}[1m]) > 10` // get 4xx http requests
|
||||
- `rate(nginx_http_requests_total{status=~"^5.."}[1m]) > 10` // get 5xx http requests
|
||||
|
||||
### Rabbitmq (kbudde/rabbitmq-exporter)
|
||||
|
||||
- `rabbitmq_up{} == 0`
|
||||
- `rabbitmq_running{} >= 2` // cluster
|
||||
- `rabbitmq_partitions{} > 0` // cluster got partition :-(
|
||||
- `rabbitmq_node_mem_used{} / rabbitmq_node_mem_limit{} * 100 > 90` // too much ram used
|
||||
- `rabbitmq_connectionsTotal{} > 1000`
|
||||
|
||||
- `rabbitmq_queue_messages_unacknowledged{queue="my-queue"} > 5`
|
||||
- `rabbitmq_queue_messages_ready{queue="my-queue"} > 1000` // more consumers needed
|
||||
- `time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60` // takes more than 1min to consume messages
|
||||
- `rabbitmq_queue_consumers{} == 0` // no consumer on queue
|
||||
- `rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5` // no activity on exchange
|
||||
|
||||
### PostgreSQL (wrouesnel/postgres_exporter)
|
||||
|
||||
- `pg_up{} == 0`
|
||||
- `pg_replication_lag{} > 10` // more than 10s lag between master and slave
|
||||
- `time() - pg_stat_user_tables_last_autovacuum{} > 60 * 60 * 24` // did not vacuum for 1 day
|
||||
- `time() - pg_stat_user_tables_last_autoanalyze{} > 60 * 60 * 24` // did not analyse for 1 day
|
||||
- `sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > 100` // too many connections
|
||||
- `sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5` // connections number too small
|
||||
- `rate(pg_stat_database_deadlocks{}[1m]) > 0`
|
||||
|
||||
### Redis (oliver006/redis_exporter)
|
||||
|
||||
- `redis_up{} == 0`
|
||||
- `time() - redis_rdb_last_save_timestamp_seconds{} > 60 * 60 * 24` // did not backup for 1 day
|
||||
- `redis_memory_used_bytes{} / redis_total_system_memory_bytes{} * 100 > 90`
|
||||
- `redis_connected_slaves{}`
|
||||
- `delta(redis_connected_slaves{}[1m]) < 0` // slave killed
|
||||
- `redis_connected_clients{} > 100` // too many connections
|
||||
- `redis_connected_clients{} < 5` // connections number too small
|
||||
- `increase(redis_rejected_connections_total{}[1m]) > 0` // rejected connections
|
||||
|
||||
### MySQL
|
||||
|
||||
### Elasticsearch
|
||||
|
||||
### MongoDB
|
||||
|
||||
### Apache
|
||||
|
||||
### HaProxy
|
||||
|
||||
### Traefik
|
||||
|
||||
### PHP-FPM
|
||||
|
||||
### Kubernetes
|
||||
|
||||
### Nomad
|
||||
|
||||
### Consul
|
||||
|
||||
### Etcd
|
||||
|
||||
### Zookeeper
|
||||
|
||||
### Linkerd
|
||||
|
||||
### Istio
|
||||
|
||||
### Blackbox
|
||||
- In Jekyll, create an alert rule builder, to create custom alerts (criticality, thresholds, instance...)
|
||||
|
|
|
|||
|
|
@ -1 +1,4 @@
|
|||
theme: jekyll-theme-cayman
|
||||
theme: jekyll-theme-cayman
|
||||
|
||||
title: Awesome Prometheus alerts
|
||||
description: Collection of alert rules
|
||||
|
|
|
|||
286
_data/rules.yml
Normal file
286
_data/rules.yml
Normal file
|
|
@ -0,0 +1,286 @@
|
|||
|
||||
services:
|
||||
- name: Prometheus
|
||||
exporters:
|
||||
- rules:
|
||||
- name: Exporter down
|
||||
description: Prometheus exporter down
|
||||
query: 'up{} == 0'
|
||||
severity: warning
|
||||
|
||||
- name: Host
|
||||
exporters:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- name: Out of memory
|
||||
description: Node memory is filling up (< 10% left)
|
||||
query: '(node_memory_MemFree{} + node_memory_Cached{} + node_memory_Buffers{}) / node_memory_MemTotal{} * 100 < 10'
|
||||
severity: warning
|
||||
- name: Unusual network throughput in
|
||||
description: Host network interfaces are probably receiving too much data (> 100 MB/s)
|
||||
query: 'sum by (instance) (irate(node_network_receive_bytes{}[2m])) / 1024 / 1024 > 100'
|
||||
severity: warning
|
||||
- name: Unusual network throughput out
|
||||
description: Host network interfaces are probably sending too much data (> 100 MB/s)
|
||||
query: 'sum by (instance) (irate(node_network_transmit_bytes{}[2m])) / 1024 / 1024 > 100'
|
||||
severity: warning
|
||||
- name: Unusual disk read rate
|
||||
description: Disk is probably reading too much data (> 50 MB/s)
|
||||
query: 'sum by (instance) (irate(node_disk_bytes_read{}[2m])) / 1024 / 1024 > 50'
|
||||
severity: warning
|
||||
- name: Unusual disk write rate
|
||||
description: Disk is probably writing too much data (> 50 MB/s)
|
||||
query: 'sum by (instance) (irate(node_disk_bytes_written{}[2m])) / 1024 / 1024 > 50'
|
||||
severity: warning
|
||||
- name: Out of disk space
|
||||
description: Disk is almost full (< 10% left)
|
||||
query: 'node_filesystem_free{mountpoint ="/rootfs"} / node_filesystem_size{mountpoint ="/rootfs"} * 100 < 10'
|
||||
severity: warning
|
||||
- name: Out of inodes
|
||||
description: Disk is almost running out of available inodes (< 10% left)
|
||||
query: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10'
|
||||
severity: warning
|
||||
- name: Unusual disk read latency
|
||||
description: Disk latency is growing (read operations > 100ms)
|
||||
query: 'rate(node_disk_read_time_ms{}[1m]) / rate(node_disk_reads_completed{}[1m]) > 100'
|
||||
severity: warning
|
||||
- name: Unusual disk write latency
|
||||
description: Disk latency is growing (write operations > 100ms)
|
||||
query: 'rate(node_disk_write_time_ms{}[1m]) / rate(node_disk_writes_completed{}[1m]) > 100'
|
||||
severity: warning
|
||||
- name: CPU load
|
||||
description: CPU load (5m) is high (> 75%)
|
||||
query: 'avg by (instance) (sum by (cpu) (rate(node_cpu{mode!="idle"}[5m]))) * 100 > 75'
|
||||
severity: warning
|
||||
- name: Context switching
|
||||
description: Context switching is growing on node (> 1000 / s)
|
||||
query: 'rate(node_context_switches{}[5m]) > 1000'
|
||||
severity: warning
|
||||
|
||||
- name: Docker containers
|
||||
exporters:
|
||||
- name: cAdvisor
|
||||
doc_url: https://github.com/google/cadvisor
|
||||
rules:
|
||||
- name: Container killed
|
||||
description: A container has disappeared
|
||||
query: 'time() - container_last_seen{} > 60'
|
||||
severity: warning
|
||||
|
||||
- name: Nginx
|
||||
exporters:
|
||||
- name: nginx-lua-prometheus
|
||||
doc_url: https://github.com/knyar/nginx-lua-prometheus
|
||||
rules:
|
||||
- name: HTTP errors 4xx
|
||||
description: Too many HTTP requests with status 4xx (> 5%)
|
||||
query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total{}[1m])) * 100 > 5'
|
||||
severity: error
|
||||
- name: HTTP errors 5xx
|
||||
description: Too many HTTP requests with status 5xx (> 5%)
|
||||
query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total{}[1m])) * 100 > 5'
|
||||
severity: error
|
||||
|
||||
- name: RabbitMQ
|
||||
exporters:
|
||||
- name: kbudde/rabbitmq-exporter
|
||||
doc_url: https://github.com/kbudde/rabbitmq_exporter
|
||||
rules:
|
||||
- name: Rabbitmq down
|
||||
description: RabbitMQ node down
|
||||
query: 'rabbitmq_up{} == 0'
|
||||
severity: error
|
||||
- name: Cluster down
|
||||
description: Less than 3 nodes running in RabbitMQ cluster
|
||||
query: 'rabbitmq_running{} < 3'
|
||||
severity: error
|
||||
- name: Cluster partition
|
||||
description: Cluster partition
|
||||
query: 'rabbitmq_partitions{} > 0'
|
||||
severity: error
|
||||
- name: Out of memory
|
||||
description: Memory available for RabbitMQ is low (< 10%)
|
||||
query: 'rabbitmq_node_mem_used{} / rabbitmq_node_mem_limit{} * 100 > 90'
|
||||
severity: warning
|
||||
- name: Too many connections
|
||||
description: RabbitMQ instance has too many connections (> 1000)
|
||||
query: 'rabbitmq_connectionsTotal{} > 1000'
|
||||
severity: warning
|
||||
- name: Dead letter queue filling up
|
||||
description: Dead letter queue is filling up (> 10 msgs)
|
||||
query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
|
||||
severity: error
|
||||
- name: Too many messages in queue
|
||||
description: Queue is filling up (> 1000 msgs)
|
||||
query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
|
||||
severity: warning
|
||||
- name: Slow queue consuming
|
||||
description: Queue messages are consumed slowly (> 60s)
|
||||
query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
|
||||
severity: warning
|
||||
- name: No consumer
|
||||
description: Queue has no consumer
|
||||
query: 'rabbitmq_queue_consumers{} == 0'
|
||||
severity: error
|
||||
- name: Too many consumers
|
||||
description: Queue should have only 1 consumer
|
||||
query: 'rabbitmq_queue_consumers{} > 1'
|
||||
severity: error
|
||||
- name: Inactive exchange
|
||||
description: Exchange receives fewer than 5 msgs per second
|
||||
query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
|
||||
severity: warning
|
||||
|
||||
- name: MySQL
|
||||
exporters:
|
||||
- name: prometheus/mysqld_exporter
|
||||
doc_url: https://github.com/prometheus/mysqld_exporter
|
||||
rules:
|
||||
|
||||
- name: PostgreSQL
|
||||
exporters:
|
||||
- name: wrouesnel/postgres_exporter
|
||||
doc_url: https://github.com/wrouesnel/postgres_exporter/
|
||||
rules:
|
||||
- name: PostgreSQL down
|
||||
description: PostgreSQL instance is down
|
||||
query: 'pg_up{} == 0'
|
||||
severity: error
|
||||
- name: Replication lag
|
||||
description: PostgreSQL replication lag is going up (> 10s)
|
||||
query: 'pg_replication_lag{} > 10'
|
||||
severity: warning
|
||||
- name: Table not vacuumed
|
||||
description: Table has not been vacuumed for 24 hours
|
||||
query: 'time() - pg_stat_user_tables_last_autovacuum{} > 60 * 60 * 24'
|
||||
severity: warning
|
||||
- name: Table not analyzed
|
||||
description: Table has not been analyzed for 24 hours
|
||||
query: 'time() - pg_stat_user_tables_last_autoanalyze{} > 60 * 60 * 24'
|
||||
severity: warning
|
||||
- name: Too many connections
|
||||
description: PostgreSQL instance has too many connections
|
||||
query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > 100'
|
||||
severity: warning
|
||||
- name: Not enough connections
|
||||
description: PostgreSQL instance should have more connections (> 5)
|
||||
query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
|
||||
severity: warning
|
||||
- name: Dead locks
|
||||
description: PostgreSQL has dead-locks
|
||||
query: 'rate(pg_stat_database_deadlocks{}[1m]) > 0'
|
||||
severity: warning
|
||||
|
||||
- name: Redis
|
||||
exporters:
|
||||
- name: oliver006/redis_exporter
|
||||
doc_url: https://github.com/oliver006/redis_exporter
|
||||
rules:
|
||||
- name: Redis down
|
||||
description: Redis instance is down
|
||||
query: 'redis_up{} == 0'
|
||||
severity: error
|
||||
- name: Missing backup
|
||||
description: Redis has not been backed up for 24 hours
|
||||
query: 'time() - redis_rdb_last_save_timestamp_seconds{} > 60 * 60 * 24'
|
||||
severity: error
|
||||
- name: Out of memory
|
||||
description: Redis is running out of memory (> 90%)
|
||||
query: 'redis_memory_used_bytes{} / redis_total_system_memory_bytes{} * 100 > 90'
|
||||
severity: warning
|
||||
- name: Replication broken
|
||||
description: Redis instance lost a slave
|
||||
query: 'delta(redis_connected_slaves{}[1m]) < 0'
|
||||
severity: error
|
||||
- name: Too many connections
|
||||
description: Redis instance has too many connections
|
||||
query: 'redis_connected_clients{} > 100'
|
||||
severity: warning
|
||||
- name: Not enough connections
|
||||
description: Redis instance should have more connections (> 5)
|
||||
query: 'redis_connected_clients{} < 5'
|
||||
severity: warning
|
||||
- name: Rejected connections
|
||||
description: Some connections to Redis have been rejected
|
||||
query: 'increase(redis_rejected_connections_total{}[1m]) > 0'
|
||||
severity: error
|
||||
|
||||
- name: MongoDB
|
||||
exporters:
|
||||
- name: dcu/mongodb_exporter
|
||||
doc_url: https://github.com/dcu/mongodb_exporter
|
||||
rules:
|
||||
|
||||
- name: Elasticsearch
|
||||
exporters:
|
||||
- name: justwatchcom/elasticsearch_exporter
|
||||
doc_url: https://github.com/justwatchcom/elasticsearch_exporter
|
||||
rules:
|
||||
|
||||
- name: Apache
|
||||
exporters:
|
||||
- name: Lusitaniae/apache_exporter
|
||||
doc_url: https://github.com/Lusitaniae/apache_exporter
|
||||
rules:
|
||||
|
||||
- name: HaProxy
|
||||
exporters:
|
||||
- name: prometheus/haproxy_exporter
|
||||
doc_url: https://github.com/prometheus/haproxy_exporter
|
||||
rules:
|
||||
|
||||
- name: Traefik
|
||||
exporters:
|
||||
- rules:
|
||||
|
||||
- name: PHP-FPM
|
||||
exporters:
|
||||
- name: bakins/php-fpm-exporter
|
||||
doc_url: https://github.com/bakins/php-fpm-exporter
|
||||
rules:
|
||||
|
||||
- name: Kubernetes
|
||||
exporters:
|
||||
- rules:
|
||||
|
||||
- name: Nomad
|
||||
exporters:
|
||||
- name: samber/prometheus-nomad-exporter
|
||||
doc_url: https://github.com/samber/prometheus-nomad-exporter
|
||||
rules:
|
||||
|
||||
- name: Consul
|
||||
exporters:
|
||||
- name: prometheus/consul_exporter
|
||||
doc_url: https://github.com/prometheus/consul_exporter
|
||||
rules:
|
||||
|
||||
- name: Etcd
|
||||
exporters:
|
||||
- rules:
|
||||
|
||||
- name: Zookeeper
|
||||
exporters:
|
||||
- name: cloudflare/kafka_zookeeper_exporter
|
||||
doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter
|
||||
rules:
|
||||
|
||||
- name: Kafka
|
||||
exporters:
|
||||
- name: danielqsj/kafka_exporter
|
||||
doc_url: https://github.com/danielqsj/kafka_exporter
|
||||
rules:
|
||||
|
||||
- name: Linkerd
|
||||
exporters:
|
||||
- rules:
|
||||
|
||||
- name: Istio
|
||||
exporters:
|
||||
- rules:
|
||||
|
||||
- name: Blackbox
|
||||
exporters:
|
||||
- name: prometheus/blackbox_exporter
|
||||
doc_url: https://github.com/prometheus/blackbox_exporter
|
||||
rules:
|
||||
54
_layouts/default.html
Normal file
54
_layouts/default.html
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="{{ site.lang | default: "en-US" }}">
|
||||
<head>
|
||||
|
||||
{% if site.google_analytics %}
|
||||
<script async src="https://www.googletagmanager.com/gtag/js?id={{ site.google_analytics }}"></script>
|
||||
<script>
|
||||
window.dataLayer = window.dataLayer || [];
|
||||
function gtag(){dataLayer.push(arguments);}
|
||||
gtag('js', new Date());
|
||||
gtag('config', '{{ site.google_analytics }}');
|
||||
</script>
|
||||
{% endif %}
|
||||
<meta charset="UTF-8">
|
||||
|
||||
{% seo %}
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta name="theme-color" content="#157878">
|
||||
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
|
||||
<link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
|
||||
</head>
|
||||
<body>
|
||||
<a id="skip-to-content" href="#content">Skip to the content.</a>
|
||||
|
||||
<header class="page-header" role="banner">
|
||||
<h1 class="project-name">
|
||||
<a href="/" style="color: white">
|
||||
{{ site.title | default: site.github.repository_name }}
|
||||
</a>
|
||||
</h1>
|
||||
<h2 class="project-tagline">{{ site.description | default: site.github.project_tagline }}</h2>
|
||||
<a href="/alertmanager" class="btn">AlertManager config</a>
|
||||
<a href="/rules" class="btn">Rules</a>
|
||||
{% if site.github.is_project_page %}
|
||||
<a href="{{ site.github.repository_url }}" class="btn">View on GitHub</a>
|
||||
{% endif %}
|
||||
{% if site.show_downloads %}
|
||||
<a href="{{ site.github.zip_url }}" class="btn">Download .zip</a>
|
||||
<a href="{{ site.github.tar_url }}" class="btn">Download .tar.gz</a>
|
||||
{% endif %}
|
||||
</header>
|
||||
|
||||
<main id="content" class="main-content" role="main">
|
||||
{{ content }}
|
||||
|
||||
<footer class="site-footer">
|
||||
{% if site.github.is_project_page %}
|
||||
<span class="site-footer-owner"><a href="{{ site.github.repository_url }}">{{ site.title }}</a> is maintained by <a href="{{ site.github.owner_url }}">{{ site.github.owner_name }}</a>.</span>
|
||||
{% endif %}
|
||||
<span class="site-footer-credits">This page was generated by <a href="https://pages.github.com">GitHub Pages</a>.</span>
|
||||
</footer>
|
||||
</main>
|
||||
</body>
|
||||
</html>
|
||||
56
alertmanager.md
Normal file
56
alertmanager.md
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
|
||||
<h2>
|
||||
AlertManager configuration
|
||||
</h2>
|
||||
|
||||
{% highlight yaml %}
|
||||
# alertmanager.yml
|
||||
|
||||
route:
|
||||
# When a new group of alerts is created by an incoming alert, wait at
|
||||
# least 'group_wait' to send the initial notification.
|
||||
# This way ensures that you get multiple alerts for the same group that start
|
||||
# firing shortly after another are batched together on the first
|
||||
# notification.
|
||||
group_wait: 10s
|
||||
|
||||
# When the first notification was sent, wait 'group_interval' to send a batch
|
||||
# of new alerts that started firing for that group.
|
||||
group_interval: 5m
|
||||
|
||||
# If an alert has successfully been sent, wait 'repeat_interval' to
|
||||
# resend them.
|
||||
repeat_interval: 30m
|
||||
|
||||
# A default receiver
|
||||
receiver: "slack"
|
||||
|
||||
# All the above attributes are inherited by all child routes and can
|
||||
# be overwritten on each.
|
||||
routes:
|
||||
- receiver: "slack"
|
||||
group_wait: 10s
|
||||
match_re:
|
||||
severity: error|warning
|
||||
continue: true
|
||||
|
||||
- receiver: "sms"
|
||||
group_wait: 10s
|
||||
match_re:
|
||||
severity: error
|
||||
continue: true
|
||||
|
||||
receivers:
|
||||
- name: "slack"
|
||||
slack_configs:
|
||||
- api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx'
|
||||
send_resolved: true
|
||||
channel: 'monitoring'
|
||||
text: "{% raw %}{{ range .Alerts }}<!channel> {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}{% endraw %}"
|
||||
|
||||
- name: "sms"
|
||||
webhook_configs:
|
||||
- url: http://a.b.c:8080/send/sms
|
||||
send_resolved: true
|
||||
|
||||
{% endhighlight %}
|
||||
BIN
assets/prometheus-logo.png
Normal file
BIN
assets/prometheus-logo.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 17 KiB |
32
index.md
Normal file
32
index.md
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
|
||||
<style>
|
||||
.center-image
|
||||
{
|
||||
margin: 0 auto;
|
||||
display: block;
|
||||
}
|
||||
</style>
|
||||
|
||||
{: .center-image }
|
||||
|
||||
<h2>
|
||||
AlertManager configuration
|
||||
</h2>
|
||||
|
||||
<a href="/alertmanager">
|
||||
See here
|
||||
</a>
|
||||
|
||||
<h2>
|
||||
Prometheus alerting rules
|
||||
</h2>
|
||||
|
||||
<ul>
|
||||
{% for service in site.data.rules.services %}
|
||||
<li>
|
||||
<a href="/rules#{{ service.name | replace: " ", "-" | downcase }}">
|
||||
{{ service.name }}
|
||||
</a>
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
105
rules.md
Normal file
105
rules.md
Normal file
|
|
@ -0,0 +1,105 @@
|
|||
<style>
|
||||
ul {
|
||||
list-style: none;
|
||||
}
|
||||
</style>
|
||||
|
||||
{% highlight yaml %}
|
||||
# prometheus.yml
|
||||
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
...
|
||||
|
||||
rule_files:
|
||||
- 'alerts/*.yml'
|
||||
|
||||
scrape_configs:
|
||||
...
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
{% highlight yaml %}
|
||||
# alerts/example-redis.yml
|
||||
|
||||
groups:
|
||||
|
||||
- name: ExampleRedisGroup
|
||||
rules:
|
||||
- alert: ExampleRedisDown
|
||||
expr: redis_up{} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: error
|
||||
annotations:
|
||||
summary: "Redis instance down"
|
||||
description: "Whatever"
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
<ul>
|
||||
{% for service in site.data.rules.services %}
|
||||
{% assign serviceIndex = forloop.index %}
|
||||
{% for exporter in service.exporters %}
|
||||
<li>
|
||||
<h2 id="{{ service.name | replace: " ", "-" | downcase }}">
|
||||
{{ serviceIndex }}.
|
||||
{{ service.name }}
|
||||
{% if exporter.name %}
|
||||
:
|
||||
{% if exporter.doc_url %}
|
||||
<a href="{{ exporter.doc_url }}">
|
||||
{{ exporter.name }}
|
||||
</a>
|
||||
{% else %}
|
||||
{{ exporter.name }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</h2>
|
||||
|
||||
{% assign nbrRules = exporter.rules | size %}
|
||||
{% if nbrRules == 0 %}
|
||||
{% highlight javascript %}
|
||||
// @TODO
|
||||
{% endhighlight %}
|
||||
{% endif %}
|
||||
|
||||
<ul>
|
||||
{% for rule in exporter.rules %}
|
||||
{% assign ruleIndex = forloop.index %}
|
||||
<li>
|
||||
<h4>
|
||||
{{ serviceIndex }}.{{ ruleIndex }}.
|
||||
{{ rule.name }}
|
||||
</h4>
|
||||
<details {% if true || (serviceIndex == 1 && ruleIndex == 1) %} open {% endif %}>
|
||||
<summary>{{ rule.description }}</summary>
|
||||
<p>
|
||||
|
||||
{% assign ruleName = rule.name | split: ' ' %}
|
||||
{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
|
||||
|
||||
{% highlight yaml %}
|
||||
- alert: {{ ruleNameCamelcase | remove: ' ' }}
|
||||
expr: {{ rule.query }}
|
||||
for: 30m
|
||||
labels:
|
||||
severity: {{ rule.severity }}
|
||||
annotations:
|
||||
summary: "{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})"
|
||||
description: "{{ rule.description }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS: {% raw %}{{ $labels }}{% endraw %}"
|
||||
|
||||
{% endhighlight %}
|
||||
|
||||
</p>
|
||||
</details>
|
||||
<br/>
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
|
||||
<hr/>
|
||||
</li>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</ul>
|
||||
Loading…
Reference in a new issue