diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..89f4ccd --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ + +# Contributing + +## Adding an alerting rule + +Rules are here: `_data/rules.yml`. + +## Run locally + +``` +gem install bundler +bundle install +bundle exec jekyll serve +``` diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..37f5eaa --- /dev/null +++ b/Gemfile @@ -0,0 +1,2 @@ +source 'https://rubygems.org' +gem 'github-pages', group: :jekyll_plugins diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..46cf81f --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,248 @@ +GEM + remote: https://rubygems.org/ + specs: + activesupport (4.2.10) + i18n (~> 0.7) + minitest (~> 5.1) + thread_safe (~> 0.3, >= 0.3.4) + tzinfo (~> 1.1) + addressable (2.5.2) + public_suffix (>= 2.0.2, < 4.0) + coffee-script (2.4.1) + coffee-script-source + execjs + coffee-script-source (1.11.1) + colorator (1.1.0) + commonmarker (0.17.13) + ruby-enum (~> 0.5) + concurrent-ruby (1.0.5) + dnsruby (1.61.2) + addressable (~> 2.5) + em-websocket (0.5.1) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + ethon (0.11.0) + ffi (>= 1.3.0) + eventmachine (1.2.7) + execjs (2.7.0) + faraday (0.15.3) + multipart-post (>= 1.2, < 3) + ffi (1.9.25) + forwardable-extended (2.6.0) + gemoji (3.0.0) + github-pages (192) + activesupport (= 4.2.10) + github-pages-health-check (= 1.8.1) + jekyll (= 3.7.4) + jekyll-avatar (= 0.6.0) + jekyll-coffeescript (= 1.1.1) + jekyll-commonmark-ghpages (= 0.1.5) + jekyll-default-layout (= 0.1.4) + jekyll-feed (= 0.10.0) + jekyll-gist (= 1.5.0) + jekyll-github-metadata (= 2.9.4) + jekyll-mentions (= 1.4.1) + jekyll-optional-front-matter (= 0.3.0) + jekyll-paginate (= 1.1.0) + jekyll-readme-index (= 0.2.0) + jekyll-redirect-from (= 0.14.0) + jekyll-relative-links (= 0.5.3) + jekyll-remote-theme (= 0.3.1) + jekyll-sass-converter (= 1.5.2) + jekyll-seo-tag (= 2.5.0) + jekyll-sitemap (= 1.2.0) + 
jekyll-swiss (= 0.4.0) + jekyll-theme-architect (= 0.1.1) + jekyll-theme-cayman (= 0.1.1) + jekyll-theme-dinky (= 0.1.1) + jekyll-theme-hacker (= 0.1.1) + jekyll-theme-leap-day (= 0.1.1) + jekyll-theme-merlot (= 0.1.1) + jekyll-theme-midnight (= 0.1.1) + jekyll-theme-minimal (= 0.1.1) + jekyll-theme-modernist (= 0.1.1) + jekyll-theme-primer (= 0.5.3) + jekyll-theme-slate (= 0.1.1) + jekyll-theme-tactile (= 0.1.1) + jekyll-theme-time-machine (= 0.1.1) + jekyll-titles-from-headings (= 0.5.1) + jemoji (= 0.10.1) + kramdown (= 1.17.0) + liquid (= 4.0.0) + listen (= 3.1.5) + mercenary (~> 0.3) + minima (= 2.5.0) + nokogiri (>= 1.8.2, < 2.0) + rouge (= 2.2.1) + terminal-table (~> 1.4) + github-pages-health-check (1.8.1) + addressable (~> 2.3) + dnsruby (~> 1.60) + octokit (~> 4.0) + public_suffix (~> 2.0) + typhoeus (~> 1.3) + html-pipeline (2.8.4) + activesupport (>= 2) + nokogiri (>= 1.4) + http_parser.rb (0.6.0) + i18n (0.9.5) + concurrent-ruby (~> 1.0) + jekyll (3.7.4) + addressable (~> 2.4) + colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 0.7) + jekyll-sass-converter (~> 1.0) + jekyll-watch (~> 2.0) + kramdown (~> 1.14) + liquid (~> 4.0) + mercenary (~> 0.3.3) + pathutil (~> 0.9) + rouge (>= 1.7, < 4) + safe_yaml (~> 1.0) + jekyll-avatar (0.6.0) + jekyll (~> 3.0) + jekyll-coffeescript (1.1.1) + coffee-script (~> 2.2) + coffee-script-source (~> 1.11.1) + jekyll-commonmark (1.2.0) + commonmarker (~> 0.14) + jekyll (>= 3.0, < 4.0) + jekyll-commonmark-ghpages (0.1.5) + commonmarker (~> 0.17.6) + jekyll-commonmark (~> 1) + rouge (~> 2) + jekyll-default-layout (0.1.4) + jekyll (~> 3.0) + jekyll-feed (0.10.0) + jekyll (~> 3.3) + jekyll-gist (1.5.0) + octokit (~> 4.2) + jekyll-github-metadata (2.9.4) + jekyll (~> 3.1) + octokit (~> 4.0, != 4.4.0) + jekyll-mentions (1.4.1) + html-pipeline (~> 2.3) + jekyll (~> 3.0) + jekyll-optional-front-matter (0.3.0) + jekyll (~> 3.0) + jekyll-paginate (1.1.0) + jekyll-readme-index (0.2.0) + jekyll (~> 3.0) + jekyll-redirect-from 
(0.14.0) + jekyll (~> 3.3) + jekyll-relative-links (0.5.3) + jekyll (~> 3.3) + jekyll-remote-theme (0.3.1) + jekyll (~> 3.5) + rubyzip (>= 1.2.1, < 3.0) + jekyll-sass-converter (1.5.2) + sass (~> 3.4) + jekyll-seo-tag (2.5.0) + jekyll (~> 3.3) + jekyll-sitemap (1.2.0) + jekyll (~> 3.3) + jekyll-swiss (0.4.0) + jekyll-theme-architect (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-cayman (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-dinky (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-hacker (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-leap-day (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-merlot (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-midnight (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-minimal (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-modernist (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-primer (0.5.3) + jekyll (~> 3.5) + jekyll-github-metadata (~> 2.9) + jekyll-seo-tag (~> 2.0) + jekyll-theme-slate (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-tactile (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-theme-time-machine (0.1.1) + jekyll (~> 3.5) + jekyll-seo-tag (~> 2.0) + jekyll-titles-from-headings (0.5.1) + jekyll (~> 3.3) + jekyll-watch (2.1.2) + listen (~> 3.0) + jemoji (0.10.1) + gemoji (~> 3.0) + html-pipeline (~> 2.2) + jekyll (~> 3.0) + kramdown (1.17.0) + liquid (4.0.0) + listen (3.1.5) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + ruby_dep (~> 1.2) + mercenary (0.3.6) + mini_portile2 (2.3.0) + minima (2.5.0) + jekyll (~> 3.5) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + minitest (5.11.3) + multipart-post (2.0.0) + nokogiri (1.8.5) + mini_portile2 (~> 2.3.0) + octokit (4.13.0) + sawyer (~> 0.8.0, >= 0.5.3) + pathutil (0.16.1) + forwardable-extended (~> 2.6) + public_suffix (2.0.5) + rb-fsevent 
(0.10.3) + rb-inotify (0.9.10) + ffi (>= 0.5.0, < 2) + rouge (2.2.1) + ruby-enum (0.7.2) + i18n + ruby_dep (1.5.0) + rubyzip (1.2.2) + safe_yaml (1.0.4) + sass (3.6.0) + sass-listen (~> 4.0.0) + sass-listen (4.0.0) + rb-fsevent (~> 0.9, >= 0.9.4) + rb-inotify (~> 0.9, >= 0.9.7) + sawyer (0.8.1) + addressable (>= 2.3.5, < 2.6) + faraday (~> 0.8, < 1.0) + terminal-table (1.8.0) + unicode-display_width (~> 1.1, >= 1.1.1) + thread_safe (0.3.6) + typhoeus (1.3.0) + ethon (>= 0.9.0) + tzinfo (1.2.5) + thread_safe (~> 0.1) + unicode-display_width (1.4.0) + +PLATFORMS + ruby + +DEPENDENCIES + github-pages + +BUNDLED WITH + 1.16.6 diff --git a/README.md b/README.md index 3a4edf8..1f42ae8 100644 --- a/README.md +++ b/README.md @@ -2,112 +2,8 @@ (WIP) +[https://awesome-prometheus-alerts.grep.to](https://awesome-prometheus-alerts.grep.to) + ## Todo -- Write full alert rules in yml files -- Make a small website with form for each rule, to build custom alerts (criticity, thresolds, instance...) - -## Queries - -### Prometheus internal - -- `up == 0` // killed exporters - -### node-exporter - -Memory: - -- `(node_memory_MemFree{} + node_memory_Cached{} + node_memory_Buffers{}) / node_memory_MemTotal{} * 100 < 5` - -Network: - -- `sum by (instance) (irate(node_network_transmit_bytes{}[2m])) / 1024 / 1024 > 100` -- `sum by (instance) (irate(node_network_receive_bytes{}[2m])) / 1024 / 1024 > 100` - -Disk: - -- `sum by (instance) (irate(node_disk_bytes_read{}[2m])) / 1024 / 1024 > 50` -- `sum by (instance) (irate(node_disk_bytes_written{}[2m])) / 1024 / 1024 > 50` -- `node_filesystem_free{mountpoint ="/rootfs"} / node_filesystem_size{mountpoint ="/rootfs"} * 100 < 10` // gb -- `node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100` // inodes -- `rate(node_disk_read_time_ms{}[1m]) / rate(node_disk_reads_completed{}[1m]) > 100` // too much latency -- `rate(node_disk_write_time_ms{}[1m]) / rate(node_disk_writes_completed{}[1m]) > 100` // 
too much latency - -CPU: - -- `avg by (instance) (sum by (cpu) (rate(node_cpu{mode!="idle"}[2m]))) * 100 > 75` // load -- `rate(node_context_switches{}[5m]) > 1000` // nbr context switch per second - -### cAdvisor - -- `time() - container_last_seen{} > 60` // get killed container - -### Nginx - -- `rate(nginx_http_requests_total{status=~"^4.."}[1m]) > 10` // get 4xx http requests -- `rate(nginx_http_requests_total{status=~"^5.."}[1m]) > 10` // get 5xx http requests - -### Rabbitmq (kbudde/rabbitmq-exporter) - -- `rabbitmq_up{} == 0` -- `rabbitmq_running{} >= 2` // cluster -- `rabbitmq_partitions{} > 0` // cluster got partition :-( -- `rabbitmq_node_mem_used{} / rabbitmq_node_mem_limit{} * 100 > 90` // too much ram used -- `rabbitmq_connectionsTotal{} > 1000` - -- `rabbitmq_queue_messages_unacknowledged{queue="my-queue"} > 5` -- `rabbitmq_queue_messages_ready{queue="my-queue"} > 1000` // more consumers needed -- `time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60` // takes more than 1min to consume messages -- `rabbitmq_queue_consumers{} == 0` // no consumer on queue -- `rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5` // no activity on exchange - -### PostgreSQL (wrouesnel/postgres_exporter) - -- `pg_up{} == 0` -- `pg_replication_lag{} > 10` // more than 10s lag between master and slave -- `time() - pg_stat_user_tables_last_autovacuum{} > 60 * 60 * 24` // did not vaccum for 1 day -- `time() - pg_stat_user_tables_last_autoanalyze{} > 60 * 60 * 24` // did not analyse for 1 day -- `sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > 100` // too many connections -- `sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5` // connections number too small -- `rate(pg_stat_database_deadlocks{pg_stat_database_de}[1m]) > 0` - -### Redis (oliver006/redis_exporter) - -- `redis_up{} == 0` -- `time() - redis_rdb_last_save_timestamp_seconds{} > 60 * 60 * 24` // did not backup for 
1 day -- `redis_memory_used_bytes{} / redis_total_system_memory_bytes{} * 100 > 90` -- `redis_connected_slaves{}` -- `delta(redis_connected_slaves{}[1m]) < 0` // slaved killed -- `redis_connected_clients{} > 100` // too many connections -- `redis_connected_clients{} < 5` // connections number too small -- `increase(redis_rejected_connections_total{}[1m]) > 0` // rejected connections - -### MySQL - -### Elasticsearch - -### MongoDB - -### Apache - -### HaProxy - -### Traefik - -### PHP-FPM - -### Kubernetes - -### Nomad - -### Consul - -### Etcd - -### Zookeeper - -### Linkerd - -### Istio - -### Blackbox +- In Jekyll, create an alert rule builder, to create custom alerts (criticality, thresholds, instance...) diff --git a/_config.yml b/_config.yml index c419263..fd86df3 100644 --- a/_config.yml +++ b/_config.yml @@ -1 +1,4 @@ -theme: jekyll-theme-cayman \ No newline at end of file +theme: jekyll-theme-cayman + +title: Awesome Prometheus alerts +description: Collection of alert rules diff --git a/_data/rules.yml b/_data/rules.yml new file mode 100644 index 0000000..5035cf6 --- /dev/null +++ b/_data/rules.yml @@ -0,0 +1,286 @@ + +services: + - name: Prometheus + exporters: + - rules: + - name: Exporter down + description: Prometheus exporter down + query: 'up{} == 0' + severity: warning + + - name: Host + exporters: + - name: node-exporter + rules: + - name: Out of memory + description: Node memory is filling up (< 10% left) + query: '(node_memory_MemFree{} + node_memory_Cached{} + node_memory_Buffers{}) / node_memory_MemTotal{} * 100 < 10' + severity: warning + - name: Unusual network throughput in + description: Host network interfaces are probably receiving too much data (> 100 MB/s) + query: 'sum by (instance) (irate(node_network_receive_bytes{}[2m])) / 1024 / 1024 > 100' + severity: warning + - name: Unusual network throughput out + description: Host network interfaces are probably sending too much data (> 100 MB/s) + query: 'sum by (instance) 
(irate(node_network_transmit_bytes{}[2m])) / 1024 / 1024 > 100' + severity: warning + - name: Unusual disk read rate + description: Disk is probably reading too much data (> 50 MB/s) + query: 'sum by (instance) (irate(node_disk_bytes_read{}[2m])) / 1024 / 1024 > 50' + severity: warning + - name: Unusual disk write rate + description: Disk is probably writing too much data (> 50 MB/s) + query: 'sum by (instance) (irate(node_disk_bytes_written{}[2m])) / 1024 / 1024 > 50' + severity: warning + - name: Out of disk space + description: Disk is almost full (< 10% left) + query: 'node_filesystem_free{mountpoint ="/rootfs"} / node_filesystem_size{mountpoint ="/rootfs"} * 100 < 10' + severity: warning + - name: Out of inodes + description: Disk is almost running out of available inodes (< 10% left) + query: 'node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint ="/rootfs"} * 100 < 10' + severity: warning + - name: Unusual disk read latency + description: Disk latency is growing (read operations > 100ms) + query: 'rate(node_disk_read_time_ms{}[1m]) / rate(node_disk_reads_completed{}[1m]) > 100' + severity: warning + - name: Unusual disk write latency + description: Disk latency is growing (write operations > 100ms) + query: 'rate(node_disk_write_time_ms{}[1m]) / rate(node_disk_writes_completed{}[1m]) > 100' + severity: warning + - name: CPU load + description: CPU load (5m) is high (> 75%) + query: 'avg by (instance) (sum by (cpu) (rate(node_cpu{mode!="idle"}[5m]))) * 100 > 75' + severity: warning + - name: Context switching + description: Context switching is growing on node (> 1000 / s) + query: 'rate(node_context_switches{}[5m]) > 1000' + severity: warning + + - name: Docker containers + exporters: + - name: cAdvisor + doc_url: https://github.com/google/cadvisor + rules: + - name: Container killed + description: A container has disappeared + query: 'time() - container_last_seen{} > 60' + severity: warning + + - name: Nginx + exporters: + - 
name: nginx-lua-prometheus + doc_url: https://github.com/knyar/nginx-lua-prometheus + rules: + - name: HTTP errors 4xx + description: Too many HTTP requests with status 4xx (> 5%) + query: 'sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total{}[1m])) * 100 > 5' + severity: error + - name: HTTP errors 5xx + description: Too many HTTP requests with status 5xx (> 5%) + query: 'sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total{}[1m])) * 100 > 5' + severity: error + + - name: RabbitMQ + exporters: + - name: kbudde/rabbitmq-exporter + doc_url: https://github.com/kbudde/rabbitmq_exporter + rules: + - name: Rabbitmq down + description: RabbitMQ node down + query: 'rabbitmq_up{} == 0' + severity: error + - name: Cluster down + description: Less than 3 nodes running in RabbitMQ cluster + query: 'rabbitmq_running{} < 3' + severity: error + - name: Cluster partition + description: Cluster partition + query: 'rabbitmq_partitions{} > 0' + severity: error + - name: Out of memory + description: Memory available for RabbitMQ is low (< 10%) + query: 'rabbitmq_node_mem_used{} / rabbitmq_node_mem_limit{} * 100 > 90' + severity: warning + - name: Too many connections + description: RabbitMQ instance has too many connections (> 1000) + query: 'rabbitmq_connectionsTotal{} > 1000' + severity: warning + - name: Dead letter queue filling up + description: Dead letter queue is filling up (> 10 msgs) + query: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' + severity: error + - name: Too many messages in queue + description: Queue is filling up (> 1000 msgs) + query: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' + severity: warning + - name: Slow queue consuming + description: Queue messages are consumed slowly (> 60s) + query: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' + severity: warning + - name: No consumer + description: Queue has no consumer + query: 
'rabbitmq_queue_consumers{} == 0' + severity: error + - name: Too many consumers + description: Queue should have only 1 consumer + query: 'rabbitmq_queue_consumers{} > 1' + severity: error + - name: Inactive exchange + description: Exchange receives fewer than 5 msgs per second + query: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' + severity: warning + + - name: MySQL + exporters: + - name: prometheus/mysqld_exporter + doc_url: https://github.com/prometheus/mysqld_exporter + rules: + + - name: PostgreSQL + exporters: + - name: wrouesnel/postgres_exporter + doc_url: https://github.com/wrouesnel/postgres_exporter/ + rules: + - name: PostgreSQL down + description: PostgreSQL instance is down + query: 'pg_up{} == 0' + severity: error + - name: Replication lag + description: PostgreSQL replication lag is going up (> 10s) + query: 'pg_replication_lag{} > 10' + severity: warning + - name: Table not vacuumed + description: Table has not been vacuumed for 24 hours + query: 'time() - pg_stat_user_tables_last_autovacuum{} > 60 * 60 * 24' + severity: warning + - name: Table not analyzed + description: Table has not been analyzed for 24 hours + query: 'time() - pg_stat_user_tables_last_autoanalyze{} > 60 * 60 * 24' + severity: warning + - name: Too many connections + description: PostgreSQL instance has too many connections + query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > 100' + severity: warning + - name: Not enough connections + description: PostgreSQL instance should have more connections (> 5) + query: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' + severity: warning + - name: Dead locks + description: PostgreSQL has dead-locks + query: 'rate(pg_stat_database_deadlocks{}[1m]) > 0' + severity: warning + + - name: Redis + exporters: + - name: oliver006/redis_exporter + doc_url: https://github.com/oliver006/redis_exporter + rules: + - name: Redis down 
+ description: Redis instance is down + query: 'redis_up{} == 0' + severity: error + - name: Missing backup + description: Redis has not been backed up for 24 hours + query: 'time() - redis_rdb_last_save_timestamp_seconds{} > 60 * 60 * 24' + severity: error + - name: Out of memory + description: Redis is running out of memory (> 90%) + query: 'redis_memory_used_bytes{} / redis_total_system_memory_bytes{} * 100 > 90' + severity: warning + - name: Replication broken + description: Redis instance lost a slave + query: 'delta(redis_connected_slaves{}[1m]) < 0' + severity: error + - name: Too many connections + description: Redis instance has too many connections + query: 'redis_connected_clients{} > 100' + severity: warning + - name: Not enough connections + description: Redis instance should have more connections (> 5) + query: 'redis_connected_clients{} < 5' + severity: warning + - name: Rejected connections + description: Some connections to Redis have been rejected + query: 'increase(redis_rejected_connections_total{}[1m]) > 0' + severity: error + + - name: MongoDB + exporters: + - name: dcu/mongodb_exporter + doc_url: https://github.com/dcu/mongodb_exporter + rules: + + - name: Elasticsearch + exporters: + - name: justwatchcom/elasticsearch_exporter + doc_url: https://github.com/justwatchcom/elasticsearch_exporter + rules: + + - name: Apache + exporters: + - name: Lusitaniae/apache_exporter + doc_url: https://github.com/Lusitaniae/apache_exporter + rules: + + - name: HaProxy + exporters: + - name: prometheus/haproxy_exporter + doc_url: https://github.com/prometheus/haproxy_exporter + rules: + + - name: Traefik + exporters: + - rules: + + - name: PHP-FPM + exporters: + - name: bakins/php-fpm-exporter + doc_url: https://github.com/bakins/php-fpm-exporter + rules: + + - name: Kubernetes + exporters: + - rules: + + - name: Nomad + exporters: + - name: samber/prometheus-nomad-exporter + doc_url: https://github.com/samber/prometheus-nomad-exporter + rules: + + - name: 
Consul + exporters: + - name: prometheus/consul_exporter + doc_url: https://github.com/prometheus/consul_exporter + rules: + + - name: Etcd + exporters: + - rules: + + - name: Zookeeper + exporters: + - name: cloudflare/kafka_zookeeper_exporter + doc_url: https://github.com/cloudflare/kafka_zookeeper_exporter + rules: + + - name: Kafka + exporters: + - name: danielqsj/kafka_exporter + doc_url: https://github.com/danielqsj/kafka_exporter + rules: + + - name: Linkerd + exporters: + - rules: + + - name: Istio + exporters: + - rules: + + - name: Blackbox + exporters: + - name: prometheus/blackbox_exporter + doc_url: https://github.com/prometheus/blackbox_exporter + rules: diff --git a/_layouts/default.html b/_layouts/default.html new file mode 100644 index 0000000..f5a0176 --- /dev/null +++ b/_layouts/default.html @@ -0,0 +1,54 @@ + + + + + {% if site.google_analytics %} + + + {% endif %} + + +{% seo %} + + + + + + + Skip to the content. + + + +
+ {{ content }} + + +
+ + diff --git a/alertmanager.md b/alertmanager.md new file mode 100644 index 0000000..d049bb4 --- /dev/null +++ b/alertmanager.md @@ -0,0 +1,56 @@ + +

+ AlertManager configuration +

+ +{% highlight yaml %} +# alertmanager.yml + +route: + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 10s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 30m + + # A default receiver + receiver: "slack" + + # All the above attributes are inherited by all child routes and can + # be overwritten on each. + routes: + - receiver: "slack" + group_wait: 10s + match_re: + severity: error|warning + continue: true + + - receiver: "sms" + group_wait: 10s + match_re: + severity: error + continue: true + +receivers: + - name: "slack" + slack_configs: + - api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx' + send_resolved: true + channel: 'monitoring' + text: "{% raw %}{{ range .Alerts }} {{ .Annotations.summary }}\n{{ .Annotations.description }}\n{{ end }}{% endraw %}" + + - name: "sms" + webhook_configs: + - url: http://a.b.c:8080/send/sms + send_resolved: true + +{% endhighlight %} diff --git a/assets/prometheus-logo.png b/assets/prometheus-logo.png new file mode 100644 index 0000000..7772109 Binary files /dev/null and b/assets/prometheus-logo.png differ diff --git a/index.md b/index.md new file mode 100644 index 0000000..fcccf2f --- /dev/null +++ b/index.md @@ -0,0 +1,32 @@ + + + +![Prometheus logo](assets/prometheus-logo.png){: .center-image } + +

+ AlertManager configuration +

+ + + See here + + +

+ Prometheus alerting rules +

+ + \ No newline at end of file diff --git a/rules.md b/rules.md new file mode 100644 index 0000000..8604774 --- /dev/null +++ b/rules.md @@ -0,0 +1,105 @@ + + +{% highlight yaml %} +# prometheus.yml + +global: + scrape_interval: 15s + ... + +rule_files: + - 'alerts/*.yml' + +scrape_configs: + ... + +{% endhighlight %} + +{% highlight yaml %} +# alerts/example-redis.yml + +groups: + +- name: ExampleRedisGroup + rules: + - alert: ExampleRedisDown + expr: redis_up{} == 0 + for: 2m + labels: + severity: error + annotations: + summary: "Redis instance down" + description: "Whatever" + +{% endhighlight %} + +