Merge branch 'master' into master-1

2026-06-26 11:27:00 +08:00 · 2025-02-16 23:46:12 +01:00 · 2025-02-16 23:46:12 +01:00 · 10724be49c
commit 10724be49c
parent 387743234f 7889a9a29b
36 changed files with 2205 additions and 1068 deletions
--- a/.github/workflows/dist.yml
+++ b/.github/workflows/dist.yml
@ -1,6 +1,7 @@
 name: Publish
 on:
  workflow_dispatch:
  push:
    branches:
      - master
@ -13,22 +14,23 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
-          ruby-version: 2.7
+          # Quoted so YAML keeps the version a string (an unquoted 3.10 would load as the float 3.1).
+          ruby-version: '3.4'
      - name: Set up yq
        uses: mikefarah/yq@master
      - name: Install liquid
-        run: gem install liquid-cli
+        run: |
         gem install liquid -v 5.5.1
         gem install liquid-cli 
      - name: Build rule configuration
        run: |
          gem install liquid-cli
          cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
          rm -rf dist/rules
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -8,12 +8,12 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Set up Ruby
        uses: ruby/setup-ruby@v1
        with:
-          ruby-version: 2.7
+          # Quoted so YAML keeps the version a string (an unquoted 3.10 would load as the float 3.1).
+          ruby-version: '3.4'
      - name: Set up yq
        uses: mikefarah/yq@master
@ -31,7 +31,7 @@ jobs:
            mkdir -p "${subdir}"
            # groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
-    
+
            for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
              exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
              cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
--- a/.gitignore
+++ b/.gitignore
@ -4,3 +4,4 @@ _site/
 .jekyll-metadata
 _data/rules.json
 test/rules/
 /node_modules
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -32,8 +32,8 @@ Or with Docker:
 docker run --rm -it -p 4000:4000 -v $(pwd):/srv/jekyll jekyll/jekyll jekyll serve
 ```
-Or with Docker-Compose:
+Or with Docker Compose:
 ```
-docker-compose up -d
+docker compose up -d
 ```
--- a/FUNDING.json
+++ b/FUNDING.json
@ -0,0 +1,7 @@
 {
  "drips": {
    "ethereum": {
      "ownedBy": "0x1Baee8431ead537455399cC7099eBb219227C1f1"
    }
  }
 }
--- a/Gemfile
+++ b/Gemfile
@ -1,3 +1,3 @@
 source 'https://rubygems.org'
-gem 'github-pages', group: :jekyll_plugins
+gem 'github-pages', '>= 232', group: :jekyll_plugins
-gem 'webrick', '~> 1.3', '>= 1.3.1'
+gem 'webrick', '~> 1.8'
--- a/Gemfile.lock
+++ b/Gemfile.lock
@ -1,66 +1,61 @@
 GEM
  remote: https://rubygems.org/
  specs:
-    activesupport (6.0.6.1)
+    activesupport (7.2.1)
-      concurrent-ruby (~> 1.0, >= 1.0.2)
+      base64
-      i18n (>= 0.7, < 2)
+      bigdecimal
-      minitest (~> 5.1)
+      concurrent-ruby (~> 1.0, >= 1.3.1)
-      tzinfo (~> 1.1)
+      connection_pool (>= 2.2.5)
-      zeitwerk (~> 2.2, >= 2.2.2)
+      drb
-    addressable (2.8.0)
+      i18n (>= 1.6, < 2)
-      public_suffix (>= 2.0.2, < 5.0)
+      logger (>= 1.4.2)
      minitest (>= 5.1)
      securerandom (>= 0.3)
      tzinfo (~> 2.0, >= 2.0.5)
    addressable (2.8.7)
      public_suffix (>= 2.0.2, < 7.0)
    base64 (0.2.0)
    bigdecimal (3.1.8)
    coffee-script (2.4.1)
      coffee-script-source
      execjs
-    coffee-script-source (1.11.1)
+    coffee-script-source (1.12.2)
    colorator (1.1.0)
    commonmarker (0.23.10)
-    concurrent-ruby (1.2.0)
+    concurrent-ruby (1.3.4)
-    dnsruby (1.61.9)
+    connection_pool (2.4.1)
-      simpleidn (~> 0.1)
+    csv (3.3.0)
    dnsruby (1.72.2)
      simpleidn (~> 0.2.1)
    drb (2.2.1)
    em-websocket (0.5.3)
      eventmachine (>= 0.12.9)
      http_parser.rb (~> 0)
-    ethon (0.15.0)
+    ethon (0.16.0)
      ffi (>= 1.15.0)
    eventmachine (1.2.7)
-    execjs (2.8.1)
+    execjs (2.9.1)
-    faraday (1.10.0)
+    faraday (2.12.0)
-      faraday-em_http (~> 1.0)
+      faraday-net_http (>= 2.0, < 3.4)
-      faraday-em_synchrony (~> 1.0)
+      json
-      faraday-excon (~> 1.1)
+      logger
-      faraday-httpclient (~> 1.0)
+    faraday-net_http (3.3.0)
-      faraday-multipart (~> 1.0)
+      net-http
-      faraday-net_http (~> 1.0)
+    ffi (1.17.0)
-      faraday-net_http_persistent (~> 1.0)
+    ffi (1.17.0-x86_64-linux-gnu)
-      faraday-patron (~> 1.0)
+    ffi (1.17.0-x86_64-linux-musl)
      faraday-rack (~> 1.0)
      faraday-retry (~> 1.0)
      ruby2_keywords (>= 0.0.4)
    faraday-em_http (1.0.0)
    faraday-em_synchrony (1.0.0)
    faraday-excon (1.1.0)
    faraday-httpclient (1.0.1)
    faraday-multipart (1.0.3)
      multipart-post (>= 1.2, < 3)
    faraday-net_http (1.0.1)
    faraday-net_http_persistent (1.2.0)
    faraday-patron (1.0.0)
    faraday-rack (1.0.0)
    faraday-retry (1.0.3)
    ffi (1.15.5)
    forwardable-extended (2.6.0)
-    gemoji (3.0.1)
+    gemoji (4.1.0)
-    github-pages (226)
+    github-pages (232)
-      github-pages-health-check (= 1.17.9)
+      github-pages-health-check (= 1.18.2)
-      jekyll (= 3.9.2)
+      jekyll (= 3.10.0)
-      jekyll-avatar (= 0.7.0)
+      jekyll-avatar (= 0.8.0)
-      jekyll-coffeescript (= 1.1.1)
+      jekyll-coffeescript (= 1.2.2)
-      jekyll-commonmark-ghpages (= 0.2.0)
+      jekyll-commonmark-ghpages (= 0.5.1)
-      jekyll-default-layout (= 0.1.4)
+      jekyll-default-layout (= 0.1.5)
-      jekyll-feed (= 0.15.1)
+      jekyll-feed (= 0.17.0)
      jekyll-gist (= 1.5.0)
-      jekyll-github-metadata (= 2.13.0)
+      jekyll-github-metadata (= 2.16.1)
      jekyll-include-cache (= 0.2.1)
      jekyll-mentions (= 1.6.0)
      jekyll-optional-front-matter (= 0.3.2)
@ -87,32 +82,34 @@ GEM
      jekyll-theme-tactile (= 0.2.0)
      jekyll-theme-time-machine (= 0.2.0)
      jekyll-titles-from-headings (= 0.5.3)
-      jemoji (= 0.12.0)
+      jemoji (= 0.13.0)
-      kramdown (= 2.3.2)
+      kramdown (= 2.4.0)
      kramdown-parser-gfm (= 1.1.0)
-      liquid (= 4.0.3)
+      liquid (= 4.0.4)
      mercenary (~> 0.3)
      minima (= 2.5.1)
-      nokogiri (>= 1.13.4, < 2.0)
+      nokogiri (>= 1.16.2, < 2.0)
-      rouge (= 3.26.0)
+      rouge (= 3.30.0)
      terminal-table (~> 1.4)
-    github-pages-health-check (1.17.9)
+      webrick (~> 1.8)
    github-pages-health-check (1.18.2)
      addressable (~> 2.3)
      dnsruby (~> 1.60)
-      octokit (~> 4.0)
+      octokit (>= 4, < 8)
-      public_suffix (>= 3.0, < 5.0)
+      public_suffix (>= 3.0, < 6.0)
      typhoeus (~> 1.3)
-    html-pipeline (2.14.1)
+    html-pipeline (2.14.3)
      activesupport (>= 2)
      nokogiri (>= 1.4)
    http_parser.rb (0.8.0)
-    i18n (0.9.5)
+    i18n (1.14.6)
      concurrent-ruby (~> 1.0)
-    jekyll (3.9.2)
+    jekyll (3.10.0)
      addressable (~> 2.4)
      colorator (~> 1.0)
      csv (~> 3.0)
      em-websocket (~> 0.5)
-      i18n (~> 0.7)
+      i18n (>= 0.7, < 2)
      jekyll-sass-converter (~> 1.0)
      jekyll-watch (~> 2.0)
      kramdown (>= 1.17, < 3)
@ -121,27 +118,28 @@ GEM
      pathutil (~> 0.9)
      rouge (>= 1.7, < 4)
      safe_yaml (~> 1.0)
-    jekyll-avatar (0.7.0)
+      webrick (>= 1.0)
    jekyll-avatar (0.8.0)
      jekyll (>= 3.0, < 5.0)
-    jekyll-coffeescript (1.1.1)
+    jekyll-coffeescript (1.2.2)
      coffee-script (~> 2.2)
-      coffee-script-source (~> 1.11.1)
+      coffee-script-source (~> 1.12)
    jekyll-commonmark (1.4.0)
      commonmarker (~> 0.22)
-    jekyll-commonmark-ghpages (0.2.0)
+    jekyll-commonmark-ghpages (0.5.1)
-      commonmarker (~> 0.23.4)
+      commonmarker (>= 0.23.7, < 1.1.0)
-      jekyll (~> 3.9.0)
+      jekyll (>= 3.9, < 4.0)
      jekyll-commonmark (~> 1.4.0)
-      rouge (>= 2.0, < 4.0)
+      rouge (>= 2.0, < 5.0)
-    jekyll-default-layout (0.1.4)
+    jekyll-default-layout (0.1.5)
-      jekyll (~> 3.0)
+      jekyll (>= 3.0, < 5.0)
-    jekyll-feed (0.15.1)
+    jekyll-feed (0.17.0)
      jekyll (>= 3.7, < 5.0)
    jekyll-gist (1.5.0)
      octokit (~> 4.2)
-    jekyll-github-metadata (2.13.0)
+    jekyll-github-metadata (2.16.1)
      jekyll (>= 3.4, < 5.0)
-      octokit (~> 4.0, != 4.4.0)
+      octokit (>= 4, < 7, != 4.4.0)
    jekyll-include-cache (0.2.1)
      jekyll (>= 3.7, < 5.0)
    jekyll-mentions (1.6.0)
@ -212,40 +210,46 @@ GEM
      jekyll (>= 3.3, < 5.0)
    jekyll-watch (2.2.1)
      listen (~> 3.0)
-    jemoji (0.12.0)
+    jemoji (0.13.0)
-      gemoji (~> 3.0)
+      gemoji (>= 3, < 5)
      html-pipeline (~> 2.2)
      jekyll (>= 3.0, < 5.0)
-    kramdown (2.3.2)
+    json (2.7.2)
    kramdown (2.4.0)
      rexml
    kramdown-parser-gfm (1.1.0)
      kramdown (~> 2.0)
-    liquid (4.0.3)
+    liquid (4.0.4)
-    listen (3.7.1)
+    listen (3.9.0)
      rb-fsevent (~> 0.10, >= 0.10.3)
      rb-inotify (~> 0.9, >= 0.9.10)
    logger (1.6.1)
    mercenary (0.3.6)
    mini_portile2 (2.8.7)
    minima (2.5.1)
      jekyll (>= 3.5, < 5.0)
      jekyll-feed (~> 0.9)
      jekyll-seo-tag (~> 2.1)
-    minitest (5.17.0)
+    minitest (5.25.1)
-    multipart-post (2.1.1)
+    net-http (0.4.1)
-    nokogiri (1.14.3-x86_64-linux)
+      uri
    nokogiri (1.16.7)
      mini_portile2 (~> 2.8.2)
      racc (~> 1.4)
-    octokit (4.22.0)
+    nokogiri (1.16.7-x86_64-linux)
-      faraday (>= 0.9)
+      racc (~> 1.4)
-      sawyer (~> 0.8.0, >= 0.5.3)
+    octokit (4.25.1)
      faraday (>= 1, < 3)
      sawyer (~> 0.9)
    pathutil (0.16.2)
      forwardable-extended (~> 2.6)
-    public_suffix (4.0.7)
+    public_suffix (5.1.1)
-    racc (1.6.2)
+    racc (1.8.1)
-    rb-fsevent (0.11.1)
+    rb-fsevent (0.11.2)
-    rb-inotify (0.10.1)
+    rb-inotify (0.11.1)
      ffi (~> 1.0)
-    rexml (3.2.5)
+    rexml (3.3.9)
-    rouge (3.26.0)
+    rouge (3.30.0)
    ruby2_keywords (0.0.5)
    rubyzip (2.3.2)
    safe_yaml (1.0.5)
    sass (3.7.4)
@ -253,32 +257,29 @@ GEM
    sass-listen (4.0.0)
      rb-fsevent (~> 0.9, >= 0.9.4)
      rb-inotify (~> 0.9, >= 0.9.7)
-    sawyer (0.8.2)
+    sawyer (0.9.2)
      addressable (>= 2.3.5)
-      faraday (> 0.8, < 2.0)
+      faraday (>= 0.17.3, < 3)
-    simpleidn (0.2.1)
+    securerandom (0.3.1)
-      unf (~> 0.1.4)
+    simpleidn (0.2.3)
    terminal-table (1.8.0)
      unicode-display_width (~> 1.1, >= 1.1.1)
-    thread_safe (0.3.6)
+    typhoeus (1.4.1)
    typhoeus (1.4.0)
      ethon (>= 0.9.0)
-    tzinfo (1.2.11)
+    tzinfo (2.0.6)
-      thread_safe (~> 0.1)
+      concurrent-ruby (~> 1.0)
    unf (0.1.4)
      unf_ext
    unf_ext (0.0.8.1)
    unicode-display_width (1.8.0)
-    webrick (1.7.0)
+    uri (0.13.1)
-    zeitwerk (2.6.6)
+    webrick (1.8.2)
 PLATFORMS
  ruby
  x86_64-linux
  x86_64-linux-musl
 DEPENDENCIES
-  github-pages
+  github-pages (>= 232)
-  webrick (~> 1.3, >= 1.3.1)
+  webrick (~> 1.8)
 BUNDLED WITH
   2.3.13
--- a/README.md
+++ b/README.md
@ -4,6 +4,21 @@
 Collection available here: **[https://samber.github.io/awesome-prometheus-alerts](https://samber.github.io/awesome-prometheus-alerts)**
 <div align="center">
  <hr>
  <sup><b>Sponsored by:</b></sup>
  <br>
  <a href="https://betterstack.com">
    <div>
      <img src="https://samber.github.io/awesome-prometheus-alerts/assets/sponsor-betterstack.png" width="200" alt="Better Stack">
    </div>
    <div>
      Better Stack lets you centralize, search, and visualize your logs.
    </div>
  </a>
  <hr>
 </div>
 ## ✨ Contents
 - [Rules](#-rules)
@ -18,6 +33,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Prometheus self-monitoring](https://samber.github.io/awesome-prometheus-alerts/rules#prometheus-internals)
 - [Host/Hardware](https://samber.github.io/awesome-prometheus-alerts/rules#host-and-hardware)
 - [SMART](https://samber.github.io/awesome-prometheus-alerts/rules#smart)
 - [Docker Containers](https://samber.github.io/awesome-prometheus-alerts/rules#docker-containers)
 - [Blackbox](https://samber.github.io/awesome-prometheus-alerts/rules#blackbox)
 - [Windows](https://samber.github.io/awesome-prometheus-alerts/rules#windows-server)
@ -35,12 +51,15 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [MongoDB](https://samber.github.io/awesome-prometheus-alerts/rules#mongodb)
 - [RabbitMQ](https://samber.github.io/awesome-prometheus-alerts/rules#rabbitmq)
 - [Elasticsearch](https://samber.github.io/awesome-prometheus-alerts/rules#elasticsearch)
 - [Meilisearch](https://samber.github.io/awesome-prometheus-alerts/rules#meilisearch)
 - [Cassandra](https://samber.github.io/awesome-prometheus-alerts/rules#cassandra)
 - [Clickhouse](https://samber.github.io/awesome-prometheus-alerts/rules#clickhouse)
 - [Zookeeper](https://samber.github.io/awesome-prometheus-alerts/rules#zookeeper)
 - [Kafka](https://samber.github.io/awesome-prometheus-alerts/rules#kafka)
 - [Pulsar](https://samber.github.io/awesome-prometheus-alerts/rules#pulsar)
 - [Nats](https://samber.github.io/awesome-prometheus-alerts/rules#nats)
 - [Solr](https://samber.github.io/awesome-prometheus-alerts/rules#solr)
 - [Hadoop](https://samber.github.io/awesome-prometheus-alerts/rules#hadoop)
 #### Reverse proxies and load balancers
@ -48,6 +67,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Apache](https://samber.github.io/awesome-prometheus-alerts/rules#apache)
 - [HaProxy](https://samber.github.io/awesome-prometheus-alerts/rules#haproxy)
 - [Traefik](https://samber.github.io/awesome-prometheus-alerts/rules#traefik)
 - [Caddy](https://samber.github.io/awesome-prometheus-alerts/rules#caddy)
 #### Runtimes
@ -83,7 +103,9 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
 - [Loki](https://samber.github.io/awesome-prometheus-alerts/rules#loki)
 - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
 - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
 - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
 - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
 - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
 ## 🤝 Contributing
--- a/_data/rules.yml
+++ b/_data/rules.yml
--- a/_layouts/default.html
+++ b/_layouts/default.html
@ -125,6 +125,18 @@
          class="fa fa-linkedin" target="_blank"></a>
      </li>
    </ul>
    <!-- Sponsor banner shown in the page header; styled by the ul#sponsoring rules in app.css. -->
    <ul id="sponsoring">
      <li>
        Kindly supported by&nbsp; 👉
      </li>
      <li>
        <a href="https://betterstack.com/">
          <!-- No width attribute: an empty value is invalid, and the stylesheet already caps the image size. -->
          <img src="assets/sponsor-betterstack.png" alt="Better Stack" />
        </a>
      </li>
    </ul>
  </header>
  <main id="content" class="main-content" role="main">
@ -147,7 +159,7 @@
        s['ScreebObject']=r;s[r]=s[r]||function(){(s[r].q=s[r].q||[]).push(arguments)};
        b=c.createElement('script');b.type='text/javascript';
        b.id=r;b.src=ee;b.async=1;c.getElementsByTagName("head")[0].appendChild(b);
-    }(window,document,'$screeb','https://t.screeb.app/tag.js'));
+    }(window,document,'$screeb','https://t2.screeb.app/tag.js'));
    $screeb('init', '232450e3-d3fe-4240-b543-649a5041a7db');
  </script>
--- a/alertmanager.md
+++ b/alertmanager.md
@ -80,7 +80,7 @@ route:
    - receiver: "pager"
      group_wait: 10s
      match_re:
-        severity: critial
+        severity: critical
      continue: true
 receivers:
@ -135,4 +135,7 @@ If the notification takes too much time to be triggered, check the following del
 - `for: 5m` (alerts/example-mysql.yml)
 - `group_wait = 10s` (alertmanager.yml)
-Also read [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html).
+Also read:
 - [https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html](https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html)
 - [https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/](https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/)
 - [https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/](https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/)
--- a/assets/css/app.css
+++ b/assets/css/app.css
@ -115,3 +115,29 @@ h2 {
        max-width: 85rem;
    }
 }
 ul#sponsoring {
    display: flex;
    align-items: center;
    justify-content: center;
    margin-top: 50px;
 }
 ul#sponsoring li {
    display: flex;
    padding: 0px 15px;
    font-size: 16px;
 }
 ul#sponsoring li a {
    display: flex;
 }
 ul#sponsoring li a img {
    max-width: 180px;
    max-height: 80px;
 }
 .page-header {
    padding-bottom: 30px;
 }
--- a/assets/sponsor-betterstack.png
+++ b/assets/sponsor-betterstack.png
--- a/dist/rules/caddy/null.yml
+++ b/dist/rules/caddy/null.yml
@ -0,0 +1,32 @@
 # Caddy alert rules.
 # NOTE(review): dist/rules looks generated from _data/rules.yml (see the dist workflow) —
 # mirror these fixes in the source data so they survive regeneration.
 groups:
 # A rule group must have a non-empty name, so the group is named explicitly.
 - name: EmbeddedExporter
  rules:
    # count() only returns groups that contain at least one series, so it can never equal 0.
    # sum() is used instead; presumably the metric is a 0/1 gauge per upstream — confirm.
    - alert: CaddyReverseProxyDown
      expr: 'sum(caddy_reverse_proxy_upstreams_healthy) by (upstream) == 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Caddy Reverse Proxy Down (instance {{ $labels.instance }})
        description: "All Caddy reverse proxies are down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Share of requests answered with a 4xx code, per instance, over the last 3 minutes.
    - alert: CaddyHighHttp4xxErrorRateService
      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"4.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Caddy high HTTP 4xx error rate service (instance {{ $labels.instance }})
        description: "Caddy service 4xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Share of requests answered with a 5xx code, per instance, over the last 3 minutes.
    - alert: CaddyHighHttp5xxErrorRateService
      expr: 'sum(rate(caddy_http_request_duration_seconds_count{code=~"5.."}[3m])) by (instance) / sum(rate(caddy_http_request_duration_seconds_count[3m])) by (instance) * 100 > 5'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: Caddy high HTTP 5xx error rate service (instance {{ $labels.instance }})
        description: "Caddy service 5xx error rate is above 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/clickhouse/embedded-exporter.yml
+++ b/dist/rules/clickhouse/embedded-exporter.yml
@ -0,0 +1,131 @@
 groups:
 - name: EmbeddedExporter
  rules:
    - alert: ClickhouseMemoryUsageCritical
      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 90'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse Memory Usage Critical (instance {{ $labels.instance }})
        description: "Memory usage is critically high, over 90%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseMemoryUsageWarning
      expr: 'ClickHouseAsyncMetrics_CGroupMemoryUsed / ClickHouseAsyncMetrics_CGroupMemoryTotal * 100 > 80'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse Memory Usage Warning (instance {{ $labels.instance }})
        description: "Memory usage is over 80%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseDiskSpaceLowOnDefault
      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 20'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse Disk Space Low on Default (instance {{ $labels.instance }})
        description: "Disk space on default is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseDiskSpaceCriticalOnDefault
      expr: 'ClickHouseAsyncMetrics_DiskAvailable_default / (ClickHouseAsyncMetrics_DiskAvailable_default + ClickHouseAsyncMetrics_DiskUsed_default) * 100 < 10'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse Disk Space Critical on Default (instance {{ $labels.instance }})
        description: "Disk space on default disk is critically low, below 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseDiskSpaceLowOnBackups
      expr: 'ClickHouseAsyncMetrics_DiskAvailable_backups / (ClickHouseAsyncMetrics_DiskAvailable_backups + ClickHouseAsyncMetrics_DiskUsed_backups) * 100 < 20'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse Disk Space Low on Backups (instance {{ $labels.instance }})
        description: "Disk space on backups is below 20%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseReplicaErrors
      expr: 'ClickHouseErrorMetric_ALL_REPLICAS_ARE_STALE == 1 or ClickHouseErrorMetric_ALL_REPLICAS_LOST == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse Replica Errors (instance {{ $labels.instance }})
        description: "Critical replica errors detected, either all replicas are stale or lost.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseNoAvailableReplicas
      expr: 'ClickHouseErrorMetric_NO_AVAILABLE_REPLICA == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse No Available Replicas (instance {{ $labels.instance }})
        description: "No available replicas in ClickHouse.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseNoLiveReplicas
      expr: 'ClickHouseErrorMetric_TOO_FEW_LIVE_REPLICAS == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
        description: "There are too few live replicas available, risking data loss and service disruption.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseHighNetworkTraffic
      expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
        description: "Network traffic is unusually high, may affect cluster performance.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseHighTcpConnections
      expr: 'ClickHouseMetrics_TCPConnection > 400'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse High TCP Connections (instance {{ $labels.instance }})
        description: "High number of TCP connections, indicating heavy client or inter-cluster communication.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseInterserverConnectionIssues
      expr: 'increase(ClickHouseMetrics_InterserverConnection[5m]) > 0'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse Interserver Connection Issues (instance {{ $labels.instance }})
        description: "An increase in interserver connections may indicate replication or distributed query handling issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseZookeeperConnectionIssues
      expr: 'avg(ClickHouseMetrics_ZooKeeperSession) != 1'
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: ClickHouse ZooKeeper Connection Issues (instance {{ $labels.instance }})
        description: "ClickHouse is experiencing issues with ZooKeeper connections, which may affect cluster state and coordination.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseAuthenticationFailures
      expr: 'increase(ClickHouseErrorMetric_AUTHENTICATION_FAILED[5m]) > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: ClickHouse Authentication Failures (instance {{ $labels.instance }})
        description: "Authentication failures detected, indicating potential security issues or misconfiguration.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ClickhouseAccessDeniedErrors
      expr: 'increase(ClickHouseErrorMetric_RESOURCE_ACCESS_DENIED[5m]) > 0'
      for: 0m
      labels:
        severity: info
      annotations:
        summary: ClickHouse Access Denied Errors (instance {{ $labels.instance }})
        description: "Access denied errors have been logged, which could indicate permission issues or unauthorized access attempts.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/docker-containers/google-cadvisor.yml
+++ b/dist/rules/docker-containers/google-cadvisor.yml
@ -23,7 +23,7 @@ groups:
        description: "A container is absent for 5 min\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ContainerHighCpuUtilization
-      expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80'
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
      for: 2m
      labels:
        severity: warning
@ -50,8 +50,8 @@ groups:
        description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ContainerHighThrottleRate
-      expr: 'rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1'
+      expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
-      for: 2m
+      for: 5m
      labels:
        severity: warning
      annotations:
@ -69,7 +69,7 @@ groups:
    - alert: ContainerLowCpuUtilization
-      expr: '(sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) < 20'
+      expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
      for: 7d
      labels:
        severity: info
--- a/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
+++ b/dist/rules/elasticsearch/prometheus-community-elasticsearch-exporter.yml
@ -138,3 +138,39 @@ groups:
      annotations:
        summary: Elasticsearch no new documents (instance {{ $labels.instance }})
        description: "No new documents for 10 min!\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ElasticsearchHighIndexingLatency
      expr: 'elasticsearch_indices_indexing_index_time_seconds_total / elasticsearch_indices_indexing_index_total > 0.0005'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Indexing Latency (instance {{ $labels.instance }})
        description: "The indexing latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ElasticsearchHighIndexingRate
      expr: 'sum(rate(elasticsearch_indices_indexing_index_total[1m]))> 10000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Indexing Rate (instance {{ $labels.instance }})
        description: "The indexing rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ElasticsearchHighQueryRate
      expr: 'sum(rate(elasticsearch_indices_search_query_total[1m])) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Query Rate (instance {{ $labels.instance }})
        description: "The query rate on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ElasticsearchHighQueryLatency
      expr: 'elasticsearch_indices_search_fetch_time_seconds / elasticsearch_indices_search_fetch_total > 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Elasticsearch High Query Latency (instance {{ $labels.instance }})
        description: "The query latency on Elasticsearch cluster is higher than the threshold.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/grafana-alloy/embedded-exporter.yml
+++ b/dist/rules/grafana-alloy/embedded-exporter.yml
@ -0,0 +1,14 @@
 # Grafana Alloy alert rules.
 groups:
 - name: EmbeddedExporter
  rules:
    # "A unless B" keeps the series of A that have no match in B. To detect a stopped instance,
    # A must be the state 2 minutes ago and B the current state: present then, absent now.
    - alert: GrafanaAlloyServiceDown
      expr: 'count by (instance) (alloy_build_info offset 2m) unless count by (instance) (alloy_build_info)'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Grafana Alloy service down (instance {{ $labels.instance }})
        description: "Alloy on (instance {{ $labels.instance }}) is not responding or has stopped running.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/graph-node/embedded-exporter.yml
+++ b/dist/rules/graph-node/embedded-exporter.yml
@ -0,0 +1,59 @@
 # Graph Node alert rules, keyed off eth_rpc_status and store_connection_wait_time_ms.
 groups:
 - name: EmbeddedExporter
  rules:
    # eth_rpc_status values 1 to 4 each map to a distinct provider failure, one rule per value.
    - alert: ProviderFailedBecauseNet_versionFailed
      expr: 'eth_rpc_status == 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because net_version failed (instance {{ $labels.instance }})
        description: "Failed net_version for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ProviderFailedBecauseGetGenesisFailed
      expr: 'eth_rpc_status == 2'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because get genesis failed (instance {{ $labels.instance }})
        description: "Failed to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ProviderFailedBecauseNet_versionTimeout
      expr: 'eth_rpc_status == 3'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because net_version timeout (instance {{ $labels.instance }})
        description: "net_version timeout for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: ProviderFailedBecauseGetGenesisTimeout
      expr: 'eth_rpc_status == 4'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Provider failed because get genesis timeout (instance {{ $labels.instance }})
        description: "Timeout to get genesis for Provider `{{$labels.provider}}` in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Two tiers share one alert name: warning above 10, critical above 20.
    # The metric name suggests the unit is milliseconds — TODO confirm.
    - alert: StoreConnectionIsTooSlow
      expr: 'store_connection_wait_time_ms > 10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Store connection is too slow (instance {{ $labels.instance }})
        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: StoreConnectionIsTooSlow
      expr: 'store_connection_wait_time_ms > 20'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Store connection is too slow (instance {{ $labels.instance }})
        description: "Store connection is too slow to `{{$labels.pool}}` pool, `{{$labels.shard}}` shard in Graph node `{{$labels.instance}}`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/hadoop/jmx_exporter.yml
+++ b/dist/rules/hadoop/jmx_exporter.yml
@ -0,0 +1,95 @@
 groups:
 - name: Jmx_exporter
  rules:
    - alert: HadoopNameNodeDown
      expr: 'up{job="hadoop-namenode"} == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Name Node Down (instance {{ $labels.instance }})
        description: "The Hadoop NameNode service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopResourceManagerDown
      expr: 'up{job="hadoop-resourcemanager"} == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
        description: "The Hadoop ResourceManager service is unavailable.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopDataNodeOutOfService
      expr: 'hadoop_datanode_last_heartbeat == 0'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
        description: "The Hadoop DataNode is not sending heartbeats.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopHdfsDiskSpaceLow
      expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
        description: "Available HDFS disk space is running low.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopMapReduceTaskFailures
      expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
        description: "There is an unusually high number of MapReduce task failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopResourceManagerMemoryHigh
      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
        description: "The Hadoop ResourceManager is approaching its memory limit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopYarnContainerAllocationFailures
      expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
        description: "There is a significant number of YARN container allocation failures.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopHbaseRegionCountHigh
      expr: 'hadoop_hbase_region_count > 5000'
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
        description: "The HBase cluster has an unusually high number of regions.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopHbaseRegionServerHeapLow
      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
      for: 10m
      labels:
        severity: critical
      annotations:
        summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
        description: "HBase Region Servers are running low on heap space.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HadoopHbaseWriteRequestsLatencyHigh
      expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
        description: "HBase Write Requests are experiencing high latency.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/haproxy/haproxy-exporter-v1.yml
+++ b/dist/rules/haproxy/haproxy-exporter-v1.yml
@ -77,7 +77,7 @@ groups:
        description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HaproxyBackendMaxActiveSession
-      expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
+      expr: '((sum by (backend) (avg_over_time(haproxy_backend_current_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@ -5,7 +5,7 @@ groups:
  rules:
    - alert: HostOutOfMemory
-      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < .10)'
      for: 2m
      labels:
        severity: warning
@ -14,106 +14,97 @@ groups:
        description: "Node memory is filling up (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostMemoryUnderMemoryPressure
-      expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_vmstat_pgmajfault[5m]) > 1000)'
-      for: 2m
+      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host memory under memory pressure (instance {{ $labels.instance }})
-        description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostMemoryIsUnderutilized
-      expr: '(100 - (rate(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
-      for: 1w
+      for: 0m
      labels:
        severity: info
      annotations:
        summary: Host Memory is underutilized (instance {{ $labels.instance }})
-        description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node memory usage is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualNetworkThroughputIn
-      expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((rate(node_network_receive_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
-      for: 5m
+      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput in (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host receive bandwidth is high (>80%).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualNetworkThroughputOut
-      expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((rate(node_network_transmit_bytes_total[5m]) / on(instance, device) node_network_speed_bytes) > .80)'
-      for: 5m
+      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host unusual network throughput out (instance {{ $labels.instance }})
-        description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Host transmit bandwidth is high (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualDiskReadRate
-      expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_disk_io_time_seconds_total[5m]) > .80)'
-      for: 5m
+      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk read rate (instance {{ $labels.instance }})
-        description: "Disk is probably reading too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk is too busy (IO wait > 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualDiskWriteRate
      expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk write rate (instance {{ $labels.instance }})
        description: "Disk is probably writing too much data (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostOutOfDiskSpace
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
-        severity: warning
+        severity: critical
      annotations:
        summary: Host out of disk space (instance {{ $labels.instance }})
        description: "Disk is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HostDiskWillFillIn24Hours
+    - alert: HostDiskMayFillIn24Hours
-      expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
+        summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Filesystem will likely run out of space within the next 24 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostOutOfInodes
-      expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_filesystem_files_free / node_filesystem_files < .10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0)'
      for: 2m
      labels:
-        severity: warning
+        severity: critical
      annotations:
        summary: Host out of inodes (instance {{ $labels.instance }})
        description: "Disk is almost running out of available inodes (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostFilesystemDeviceError
-      expr: 'node_filesystem_device_error == 1'
+      expr: 'node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1'
-      for: 0m
+      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Host filesystem device error (instance {{ $labels.instance }})
-        description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Error stat-ing the {{ $labels.mountpoint }} filesystem\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HostInodesWillFillIn24Hours
+    - alert: HostInodesMayFillIn24Hours
-      expr: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
+        summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
-        description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Filesystem will likely run out of inodes within the next 24 hours at current write rate\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualDiskReadLatency
-      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: warning
@ -122,7 +113,7 @@ groups:
        description: "Disk latency is growing (read operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualDiskWriteLatency
-      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)'
      for: 2m
      labels:
        severity: warning
@ -131,7 +122,7 @@ groups:
        description: "Disk latency is growing (write operations > 100ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostHighCpuLoad
-      expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > .80'
      for: 10m
      labels:
        severity: warning
@ -140,16 +131,16 @@ groups:
        description: "CPU load is > 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostCpuIsUnderutilized
-      expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
      for: 1w
      labels:
        severity: info
      annotations:
        summary: Host CPU is underutilized (instance {{ $labels.instance }})
-        description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "CPU load has been < 20% for 1 week. Consider reducing the number of CPUs.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostCpuStealNoisyNeighbor
-      expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10'
      for: 0m
      labels:
        severity: warning
@ -158,34 +149,34 @@ groups:
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostCpuHighIowait
-      expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > .10'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Host CPU high iowait (instance {{ $labels.instance }})
-        description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "CPU iowait > 10%. Your CPU is idling waiting for storage to respond.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostUnusualDiskIo
-      expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'rate(node_disk_io_time_seconds_total[5m]) > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Host unusual disk IO (instance {{ $labels.instance }})
-        description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HostContextSwitching
+    - alert: HostContextSwitchingHigh
-      expr: '((rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: Host context switching (instance {{ $labels.instance }})
+        summary: Host context switching high (instance {{ $labels.instance }})
-        description: "Context switching is growing on the node (> 10000 / s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Context switching is growing on the node (twice the daily average during the last 15m)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostSwapIsFillingUp
-      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)'
      for: 2m
      labels:
        severity: warning
@ -194,7 +185,7 @@ groups:
        description: "Swap is filling up (>80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostSystemdServiceCrashed
-      expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_systemd_unit_state{state="failed"} == 1)'
      for: 0m
      labels:
        severity: warning
@ -203,7 +194,7 @@ groups:
        description: "systemd service crashed\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostPhysicalComponentTooHot
-      expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'node_hwmon_temp_celsius > node_hwmon_temp_max_celsius'
      for: 5m
      labels:
        severity: warning
@ -212,7 +203,7 @@ groups:
        description: "Physical hardware component too hot\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostNodeOvertemperatureAlarm
-      expr: '(node_hwmon_temp_crit_alarm_celsius == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))'
      for: 0m
      labels:
        severity: critical
@ -220,35 +211,35 @@ groups:
        summary: Host node overtemperature alarm (instance {{ $labels.instance }})
        description: "Physical node temperature alarm triggered\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HostRaidArrayGotInactive
+    - alert: HostSoftwareRaidInsufficientDrives
-      expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_md_disks_required - on(device, instance) node_md_disks{state="active"}) > 0)'
      for: 0m
      labels:
        severity: critical
      annotations:
-        summary: Host RAID array got inactive (instance {{ $labels.instance }})
+        summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
-        description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: HostRaidDiskFailure
+    - alert: HostSoftwareRaidDiskFailure
-      expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_md_disks{state="failed"} > 0)'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Host RAID disk failure (instance {{ $labels.instance }})
+        summary: Host software RAID disk failure (instance {{ $labels.instance }})
-        description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostKernelVersionDeviations
-      expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: 'changes(node_uname_info[1h]) > 0'
-      for: 6h
+      for: 0m
      labels:
-        severity: warning
+        severity: info
      annotations:
        summary: Host kernel version deviations (instance {{ $labels.instance }})
-        description: "Different kernel versions are running\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Kernel version for {{ $labels.instance }} has changed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostOomKillDetected
-      expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(increase(node_vmstat_oom_kill[1m]) > 0)'
      for: 0m
      labels:
        severity: warning
@ -257,7 +248,7 @@ groups:
        description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostEdacCorrectableErrorsDetected
-      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(increase(node_edac_correctable_errors_total[1m]) > 0)'
      for: 0m
      labels:
        severity: info
@ -266,7 +257,7 @@ groups:
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostEdacUncorrectableErrorsDetected
-      expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_edac_uncorrectable_errors_total > 0)'
      for: 0m
      labels:
        severity: warning
@ -275,7 +266,7 @@ groups:
        description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostNetworkReceiveErrors
-      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)'
      for: 2m
      labels:
        severity: warning
@ -284,7 +275,7 @@ groups:
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostNetworkTransmitErrors
-      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)'
      for: 2m
      labels:
        severity: warning
@ -292,17 +283,8 @@ groups:
        summary: Host Network Transmit Errors (instance {{ $labels.instance }})
        description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostNetworkInterfaceSaturated
      expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: Host Network Interface Saturated (instance {{ $labels.instance }})
        description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostNetworkBondDegraded
-      expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_bonding_active - node_bonding_slaves) != 0)'
      for: 2m
      labels:
        severity: warning
@ -311,7 +293,7 @@ groups:
        description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostConntrackLimit
-      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)'
      for: 5m
      labels:
        severity: warning
@ -320,7 +302,7 @@ groups:
        description: "The number of conntrack is approaching limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostClockSkew
-      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))'
      for: 10m
      labels:
        severity: warning
@ -329,7 +311,7 @@ groups:
        description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostClockNotSynchronising
-      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)'
      for: 2m
      labels:
        severity: warning
@ -338,7 +320,7 @@ groups:
        description: "Clock not synchronising. Ensure NTP is configured on this host.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: HostRequiresReboot
-      expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+      expr: '(node_reboot_required > 0)'
      for: 4h
      labels:
        severity: info
--- a/dist/rules/kubernetes/kubestate-exporter.yml
+++ b/dist/rules/kubernetes/kubestate-exporter.yml
@ -10,44 +10,44 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes node not ready (instance {{ $labels.instance }})
+        summary: Kubernetes Node not ready (node {{ $labels.node }})
        description: "Node {{ $labels.node }} has been unready for a long time\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesMemoryPressure
+    - alert: KubernetesNodeMemoryPressure
      expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+        summary: Kubernetes memory pressure (node {{ $labels.node }})
-        description: "{{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node {{ $labels.node }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesDiskPressure
+    - alert: KubernetesNodeDiskPressure
      expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+        summary: Kubernetes disk pressure (node {{ $labels.node }})
-        description: "{{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node {{ $labels.node }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesNetworkUnavailable
+    - alert: KubernetesNodeNetworkUnavailable
      expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
      for: 2m
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes network unavailable (instance {{ $labels.instance }})
+        summary: Kubernetes Node network unavailable (node {{ $labels.node }})
-        description: "{{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node {{ $labels.node }} has NetworkUnavailable condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesOutOfCapacity
+    - alert: KubernetesNodeOutOfPodCapacity
-      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+      expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes out of capacity (instance {{ $labels.instance }})
+        summary: Kubernetes Node out of pod capacity (node {{ $labels.node }})
-        description: "{{ $labels.node }} is out of capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Node {{ $labels.node }} is out of pod capacity\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesContainerOomKiller
      expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
@ -55,7 +55,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes container oom killer (instance {{ $labels.instance }})
+        summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
        description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesJobFailed
@ -64,16 +64,25 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes Job failed (instance {{ $labels.instance }})
+        summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesJobNotStarting
      expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
        description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesCronjobSuspended
      expr: 'kube_cronjob_spec_suspend != 0'
      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesPersistentvolumeclaimPending
@ -82,7 +91,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
+        summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
        description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesVolumeOutOfDiskSpace
@ -95,13 +104,13 @@ groups:
        description: "Volume is almost full (< 10% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesVolumeFullInFourDays
-      expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+      expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
-        description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Predicted available space in four days: {{ $value | humanize1024 }}B.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesPersistentvolumeError
      expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
@ -109,8 +118,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
+        summary: Kubernetes PersistentVolume error ({{ $labels.persistentvolume }})
-        description: "Persistent volume is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesStatefulsetDown
      expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
@ -118,35 +127,35 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
-        description: "A StatefulSet went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesHpaScalingAbility
+    - alert: KubernetesHpaScaleInability
-      expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+      expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
      for: 2m
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
+        summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
-        description: "Pod is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesHpaMetricAvailability
+    - alert: KubernetesHpaMetricsUnavailability
      expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
      for: 0m
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
+        summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
-        description: "HPA is not able to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesHpaScaleCapability
+    - alert: KubernetesHpaScaleMaximum
-      expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+      expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
      for: 2m
      labels:
        severity: info
      annotations:
-        summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
+        summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
-        description: "The maximum number of desired Pods has been hit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesHpaUnderutilized
      expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
@ -155,7 +164,7 @@ groups:
        severity: info
      annotations:
        summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
-        description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesPodNotHealthy
      expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
@ -163,8 +172,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
+        summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
-        description: "Pod has been in a non-ready state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesPodCrashLooping
      expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
@ -172,17 +181,17 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+        summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
-        description: "Pod {{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-    - alert: KubernetesReplicassetMismatch
+    - alert: KubernetesReplicasetReplicasMismatch
      expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
      for: 10m
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
-        description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesDeploymentReplicasMismatch
      expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
@ -190,8 +199,8 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
-        description: "Deployment Replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesStatefulsetReplicasMismatch
      expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
@ -200,7 +209,7 @@ groups:
        severity: warning
      annotations:
        summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
-        description: "A StatefulSet does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not match the expected number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesDeploymentGenerationMismatch
      expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@ -208,8 +217,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
-        description: "A Deployment has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesStatefulsetGenerationMismatch
      expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
@ -217,8 +226,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
-        description: "A StatefulSet has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesStatefulsetUpdateNotRolledOut
      expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
@ -226,8 +235,8 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
+        summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
-        description: "StatefulSet update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesDaemonsetRolloutStuck
      expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
@ -235,8 +244,8 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
+        summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
-        description: "Some Pods of DaemonSet are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesDaemonsetMisscheduled
      expr: 'kube_daemonset_status_number_misscheduled > 0'
@ -244,8 +253,8 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
+        summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
-        description: "Some DaemonSet Pods are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesCronjobTooLong
      expr: 'time() - kube_cronjob_next_schedule_time > 3600'
@ -253,7 +262,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
+        summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
        description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesJobSlowCompletion
@ -262,11 +271,11 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Kubernetes job slow completion (instance {{ $labels.instance }})
+        summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
        description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesApiServerErrors
-      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+      expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
      for: 2m
      labels:
        severity: critical
@ -302,7 +311,7 @@ groups:
        description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: KubernetesApiServerLatency
-      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/meilisearch/embedded-exporter.yml
+++ b/dist/rules/meilisearch/embedded-exporter.yml
@ -0,0 +1,23 @@
 groups:
 - name: EmbeddedExporter
  rules:
    - alert: MeilisearchIndexIsEmpty
      expr: 'meilisearch_index_docs_count == 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Meilisearch index is empty (instance {{ $labels.instance }})
        description: "Meilisearch index contains no documents\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MeilisearchHttpResponseTime
      expr: 'meilisearch_http_response_time_seconds > 0.5'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: Meilisearch http response time (instance {{ $labels.instance }})
        description: "Meilisearch http response time is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/minio/embedded-exporter.yml
+++ b/dist/rules/minio/embedded-exporter.yml
@ -5,7 +5,7 @@ groups:
  rules:
    - alert: MinioClusterDiskOffline
-      expr: 'minio_cluster_disk_offline_total > 0'
+      expr: 'minio_cluster_drive_offline_total > 0'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/mongodb/percona-mongodb-exporter.yml
+++ b/dist/rules/mongodb/percona-mongodb-exporter.yml
@ -66,12 +66,3 @@ groups:
      annotations:
        summary: MongoDB too many connections (instance {{ $labels.instance }})
        description: "Too many connections (> 80%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MongodbVirtualMemoryUsage
      expr: '(sum(mongodb_ss_mem_virtual) BY (instance) / sum(mongodb_ss_mem_resident) BY (instance)) > 3'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MongoDB virtual memory usage (instance {{ $labels.instance }})
        description: "High memory usage\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/mysql/mysqld-exporter.yml
+++ b/dist/rules/mysql/mysqld-exporter.yml
@ -22,6 +22,15 @@ groups:
        summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
        description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlHighPreparedStatementsUtilization(>80%)
      expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
        description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlHighThreadsRunning
      expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
      for: 2m
@ -84,3 +93,39 @@ groups:
      annotations:
        summary: MySQL restarted (instance {{ $labels.instance }})
        description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlHighQps
      expr: 'irate(mysql_global_status_questions[1m]) > 10000'
      for: 2m
      labels:
        severity: info
      annotations:
        summary: MySQL High QPS (instance {{ $labels.instance }})
        description: "MySQL is being overloaded with an unusually high QPS (> 10k QPS).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlTooManyOpenFiles
      expr: 'mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MySQL too many open files (instance {{ $labels.instance }})
        description: "MySQL has too many open files, consider increasing the open_files_limit variable on {{ $labels.instance }}.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlInnodbForceRecoveryIsEnabled
      expr: 'mysql_global_variables_innodb_force_recovery != 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
        description: "MySQL InnoDB force recovery is enabled on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: MysqlInnodbHistory_lenTooLong
      expr: 'mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
        description: "MySQL history_len (undo log) too long on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/nats/nats-exporter.yml
+++ b/dist/rules/nats/nats-exporter.yml
@ -32,10 +32,154 @@ groups:
        description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighRoutesCount
-      expr: 'gnatsd_routez_num_routes > 10'
+      expr: 'gnatsd_varz_routes > 10'
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: Nats high routes count (instance {{ $labels.instance }})
        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighMemoryUsage
      expr: 'gnatsd_varz_mem > 200 * 1024 * 1024'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high memory usage (instance {{ $labels.instance }})
        description: "NATS server memory usage is above 200MB for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsSlowConsumers
      expr: 'gnatsd_varz_slow_consumers > 0'
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: Nats slow consumers (instance {{ $labels.instance }})
        description: "There are slow consumers in NATS for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsServerDown
      expr: 'absent(up{job="nats"})'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Nats server down (instance {{ $labels.instance }})
        description: "NATS server has been down for more than 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighCpuUsage
      expr: 'rate(gnatsd_varz_cpu[5m]) > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high CPU usage (instance {{ $labels.instance }})
        description: "NATS server is using more than 80% CPU for the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighNumberOfConnections
      expr: 'gnatsd_connz_num_connections > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high number of connections (instance {{ $labels.instance }})
        description: "NATS server has more than 1000 active connections\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighJetstreamStoreUsage
      expr: 'gnatsd_varz_jetstream_stats_storage / gnatsd_varz_jetstream_config_max_storage > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high JetStream store usage (instance {{ $labels.instance }})
        description: "JetStream store usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighJetstreamMemoryUsage
      expr: 'gnatsd_varz_jetstream_stats_memory / gnatsd_varz_jetstream_config_max_memory > 0.8'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high JetStream memory usage (instance {{ $labels.instance }})
        description: "JetStream memory usage is over 80%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighNumberOfSubscriptions
      expr: 'gnatsd_connz_subscriptions > 1000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high number of subscriptions (instance {{ $labels.instance }})
        description: "NATS server has more than 1000 active subscriptions\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsHighPendingBytes
      expr: 'gnatsd_connz_pending_bytes > 100000'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats high pending bytes (instance {{ $labels.instance }})
        description: "NATS server has more than 100,000 pending bytes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsTooManyErrors
      expr: 'increase(gnatsd_varz_jetstream_stats_api_errors[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats too many errors (instance {{ $labels.instance }})
        description: "NATS server has encountered errors in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsJetstreamConsumersExceeded
      expr: 'sum(gnatsd_varz_jetstream_stats_accounts) > 100'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats JetStream consumers exceeded (instance {{ $labels.instance }})
        description: "JetStream has more than 100 active consumers\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsFrequentAuthenticationTimeouts
      expr: 'increase(gnatsd_varz_auth_timeout[5m]) > 5'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats frequent authentication timeouts (instance {{ $labels.instance }})
        description: "There have been more than 5 authentication timeouts in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsMaxPayloadSizeExceeded
      expr: 'max(gnatsd_varz_max_payload) > 1024 * 1024'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Nats max payload size exceeded (instance {{ $labels.instance }})
        description: "The max payload size allowed by NATS has been exceeded (1MB)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsLeafNodeConnectionIssue
      expr: 'increase(gnatsd_varz_leafnodes[5m]) == 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Nats leaf node connection issue (instance {{ $labels.instance }})
        description: "No leaf node connections have been established in the last 5 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsMaxPingOperationsExceeded
      expr: 'gnatsd_varz_ping_max > 50'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Nats max ping operations exceeded (instance {{ $labels.instance }})
        description: "The maximum number of ping operations in NATS has exceeded 50\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NatsWriteDeadlineExceeded
      expr: 'gnatsd_varz_write_deadline > 10'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: Nats write deadline exceeded (instance {{ $labels.instance }})
        description: "The write deadline has been exceeded in NATS, indicating potential message delivery issues\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/netdata/embedded-exporter.yml
+++ b/dist/rules/netdata/embedded-exporter.yml
@ -23,7 +23,7 @@ groups:
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: NetdataHighMemoryUsage
-      expr: '100 / netdata_system_ram_MB_average * netdata_system_ram_MB_average{dimension=~"free|cached"} < 20'
+      expr: '100 / netdata_system_ram_MiB_average * netdata_system_ram_MiB_average{dimension=~"free|cached"} < 20'
      for: 5m
      labels:
        severity: warning
--- a/dist/rules/postgresql/postgres-exporter.yml
+++ b/dist/rules/postgresql/postgres-exporter.yml
@ -32,7 +32,7 @@ groups:
        description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlTableNotAutoVacuumed
-      expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_vacuum_threshold) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
      for: 0m
      labels:
        severity: warning
@ -41,7 +41,7 @@ groups:
        description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlTableNotAutoAnalyzed
-      expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
+      expr: '((pg_stat_user_tables_n_tup_del + pg_stat_user_tables_n_tup_upd + pg_stat_user_tables_n_tup_hot_upd) > pg_settings_autovacuum_analyze_threshold) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
      for: 0m
      labels:
        severity: warning
@ -50,7 +50,7 @@ groups:
        description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlTooManyConnections
-      expr: ''
+      expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
      for: 2m
      labels:
        severity: warning
@ -62,7 +62,7 @@ groups:
      expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
      for: 2m
      labels:
-        severity: warning
+        severity: critical
      annotations:
        summary: Postgresql not enough connections (instance {{ $labels.instance }})
        description: "PostgreSQL instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
@ -86,7 +86,7 @@ groups:
        description: "Ratio of transactions being aborted compared to committed is > 2 %\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlCommitRateLow
-      expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
+      expr: 'increase(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[5m]) < 5'
      for: 2m
      labels:
        severity: critical
@ -140,7 +140,7 @@ groups:
        description: "PostgreSQL dead tuples is too large\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlConfigurationChanged
-      expr: '{__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
+      expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
      for: 0m
      labels:
        severity: info
@ -155,7 +155,7 @@ groups:
        severity: critical
      annotations:
        summary: Postgresql SSL compression active (instance {{ $labels.instance }})
-        description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Database allows connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PostgresqlTooManyLocksAcquired
      expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
@ -183,3 +183,12 @@ groups:
      annotations:
        summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
        description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Fires for every pg_general_index_info_pg_relation_size series whose
    # indexrelname contains "ccnew", once the series has matched for 6 hours.
    # The expression has no comparison, so the sample value itself is ignored.
    # NOTE(review): "ccnew" is presumably the suffix left on the index built by
    # a failed REINDEX CONCURRENTLY - confirm against the exporter query that
    # produces this metric.
    - alert: PostgresqlInvalidIndex
      expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
      for: 6h
      labels:
        severity: warning
      annotations:
        summary: Postgresql invalid index (instance {{ $labels.instance }})
        description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
+++ b/dist/rules/prometheus-self-monitoring/embedded-exporter.yml
@ -32,7 +32,7 @@ groups:
        description: "A Prometheus job does not have living target anymore.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: PrometheusTargetMissingWithWarmupTime
-      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_left(__name__) (node_time_seconds - node_boot_time_seconds > 600))'
      for: 0m
      labels:
        severity: critical
--- a/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/kbudde-rabbitmq-exporter.yml
@ -10,7 +10,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq down (instance {{ $labels.instance }})
+        summary: RabbitMQ down (instance {{ $labels.instance }})
        description: "RabbitMQ node down\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqClusterDown
@ -19,7 +19,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq cluster down (instance {{ $labels.instance }})
+        summary: RabbitMQ cluster down (instance {{ $labels.instance }})
        description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqClusterPartition
@ -28,7 +28,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq cluster partition (instance {{ $labels.instance }})
+        summary: RabbitMQ cluster partition (instance {{ $labels.instance }})
        description: "Cluster partition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqOutOfMemory
@ -37,7 +37,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq out of memory (instance {{ $labels.instance }})
+        summary: RabbitMQ out of memory (instance {{ $labels.instance }})
        description: "Memory available for RabbitMQ is low (< 10%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqTooManyConnections
@ -46,7 +46,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
        description: "RabbitMQ instance has too many connections (> 1000)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqDeadLetterQueueFillingUp
@ -55,7 +55,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})
+        summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
        description: "Dead letter queue is filling up (> 10 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqTooManyMessagesInQueue
@ -64,7 +64,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq too many messages in queue (instance {{ $labels.instance }})
+        summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
        description: "Queue is filling up (> 1000 msgs)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqSlowQueueConsuming
@ -73,7 +73,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq slow queue consuming (instance {{ $labels.instance }})
+        summary: RabbitMQ slow queue consuming (instance {{ $labels.instance }})
        description: "Queue messages are consumed slowly (> 60s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqNoConsumer
@ -82,7 +82,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq no consumer (instance {{ $labels.instance }})
+        summary: RabbitMQ no consumer (instance {{ $labels.instance }})
        description: "Queue has no consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqTooManyConsumers
@ -91,7 +91,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq too many consumers (instance {{ $labels.instance }})
+        summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
        description: "Queue should have only 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqUnactiveExchange
@ -100,5 +100,5 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq unactive exchange (instance {{ $labels.instance }})
+        summary: RabbitMQ unactive exchange (instance {{ $labels.instance }})
        description: "Exchange receive less than 5 msgs per second\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/rabbitmq/rabbitmq-exporter.yml
+++ b/dist/rules/rabbitmq/rabbitmq-exporter.yml
@ -10,7 +10,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq node down (instance {{ $labels.instance }})
+        summary: RabbitMQ node down (instance {{ $labels.instance }})
        description: "Less than 3 nodes running in RabbitMQ cluster\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqNodeNotDistributed
@ -19,7 +19,7 @@ groups:
      labels:
        severity: critical
      annotations:
-        summary: Rabbitmq node not distributed (instance {{ $labels.instance }})
+        summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
        description: "Distribution link state is not 'up'\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqInstancesDifferentVersions
@ -28,8 +28,8 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq instances different versions (instance {{ $labels.instance }})
+        summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
-        description: "Running different version of Rabbitmq in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqMemoryHigh
      expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
@ -37,7 +37,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq memory high (instance {{ $labels.instance }})
+        summary: RabbitMQ memory high (instance {{ $labels.instance }})
        description: "A node use more than 90% of allocated RAM\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqFileDescriptorsUsage
@ -46,16 +46,25 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }})
+        summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
        description: "A node use more than 90% of file descriptors\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Fires when the number of ready messages, summed per queue, stays above
    # 1000 for one minute.
    # NOTE(review): `sum(...) BY (queue)` keeps only the `queue` label, so
    # `$labels.instance` is likely empty in the annotations below - consider
    # aggregating by (instance, queue) if the instance should be reported.
    - alert: RabbitmqTooManyReadyMessages
      expr: 'sum(rabbitmq_queue_messages_ready) BY (queue) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: RabbitMQ too many ready messages (instance {{ $labels.instance }})
        description: "RabbitMQ too many ready messages on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqTooManyUnackMessages
      expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
      for: 1m
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq too many unack messages (instance {{ $labels.instance }})
+        summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
        description: "Too many unacknowledged messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqTooManyConnections
@ -64,7 +73,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq too many connections (instance {{ $labels.instance }})
+        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
        description: "The total connections of a node is too high\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqNoQueueConsumer
@ -73,7 +82,7 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq no queue consumer (instance {{ $labels.instance }})
+        summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
        description: "A queue has less than 1 consumer\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RabbitmqUnroutableMessages
@ -82,5 +91,5 @@ groups:
      labels:
        severity: warning
      annotations:
-        summary: Rabbitmq unroutable messages (instance {{ $labels.instance }})
+        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
        description: "A queue has unroutable messages\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/rules/redis/oliver006-redis-exporter.yml
+++ b/dist/rules/redis/oliver006-redis-exporter.yml
@ -77,7 +77,7 @@ groups:
        description: "Redis is running out of system memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    - alert: RedisOutOfConfiguredMaxmemory
-      expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90'
+      expr: 'redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90 and on(instance) redis_memory_max_bytes > 0'
      for: 2m
      labels:
        severity: warning
--- a/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
+++ b/dist/rules/s.m.a.r.t-device-monitoring/smartctl-exporter.yml
@ -0,0 +1,77 @@
 groups:
 - name: SmartctlExporter
  rules:
    # Warns when the 5-minute average of the "current" temperature is above 60
    # (reported as degrees Celsius in the description).
    # The `unless on (instance, device)` clause drops devices that also expose a
    # "drive_trip" temperature series, so this fixed threshold only applies to
    # devices without a trip value.
    - alert: SmartDeviceTemperatureWarning
      expr: '(avg_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 60'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SMART device temperature warning (instance {{ $labels.instance }})
        description: "Device temperature warning on {{ $labels.instance }} drive {{ $labels.device }} over 60°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical when the highest "current" temperature sample of the last
    # 5 minutes is above 70 (reported as degrees Celsius in the description).
    # As with the warning rule, `unless on (instance, device)` excludes devices
    # that expose a "drive_trip" temperature series.
    - alert: SmartDeviceTemperatureCritical
      expr: '(max_over_time(smartctl_device_temperature{temperature_type="current"} [5m]) unless on (instance, device) smartctl_device_temperature{temperature_type="drive_trip"}) > 70'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART device temperature critical (instance {{ $labels.instance }})
        description: "Device temperature critical on {{ $labels.instance }} drive {{ $labels.device }} over 70°C\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical when the highest "current" temperature sample of the last
    # 10 minutes is at or above the device's own "drive_trip" temperature.
    # The two series are matched one-to-one on (device, instance).
    - alert: SmartDeviceTemperatureOverTripValue
      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) smartctl_device_temperature{temperature_type="drive_trip"}'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART device temperature over trip value (instance {{ $labels.instance }})
        description: "Device temperature over trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Warns when the highest "current" temperature sample of the last
    # 10 minutes reaches at least 80% of the device's "drive_trip" temperature,
    # matched one-to-one on (device, instance).
    # The condition has no upper bound, so it also holds once the trip value
    # itself is exceeded.
    - alert: SmartDeviceTemperatureNearingTripValue
      expr: 'max_over_time(smartctl_device_temperature{temperature_type="current"} [10m]) >= on(device, instance) (smartctl_device_temperature{temperature_type="drive_trip"} * .80)'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: SMART device temperature nearing trip value (instance {{ $labels.instance }})
        description: "Device temperature at 80% of trip value on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical as soon as smartctl_device_smart_status reports any value other
    # than 1; the description treats that as a SMART status failure.
    - alert: SmartStatus
      expr: 'smartctl_device_smart_status != 1'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART status (instance {{ $labels.instance }})
        description: "Device has a SMART status failure on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical as soon as smartctl_device_critical_warning is greater than 0.
    - alert: SmartCriticalWarning
      expr: 'smartctl_device_critical_warning > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART critical warning (instance {{ $labels.instance }})
        description: "Disk controller has critical warning on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical as soon as smartctl_device_media_errors is greater than 0.
    - alert: SmartMediaErrors
      expr: 'smartctl_device_media_errors > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART media errors (instance {{ $labels.instance }})
        description: "Disk controller detected media errors on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical as soon as smartctl_device_available_spare drops below
    # smartctl_device_available_spare_threshold for the same series labels.
    - alert: SmartWearoutIndicator
      expr: 'smartctl_device_available_spare < smartctl_device_available_spare_threshold'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: SMART Wearout Indicator (instance {{ $labels.instance }})
        description: "Device is wearing out on {{ $labels.instance }} drive {{ $labels.device }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
--- a/dist/template.yml
+++ b/dist/template.yml
@ -11,6 +11,6 @@ groups:
      labels:
        severity: {{ rule.severity }}
      annotations:
-        summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
+        summary: {% if rule.summary %}{{ rule.summary }}{% else %}{{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}){% endif %}
        description: "{{ rule.description | replace: '"', '\"' }}\n  VALUE = {% raw %}{{ $value }}{% endraw %}\n  LABELS = {% raw %}{{ $labels }}{% endraw %}"
 {% endfor %}