awesome-prometheus-alerts/site/src/pages/alertmanager.astro

---
import GuideLayout from '../layouts/GuideLayout.astro';

// Value of import.meta.env.BASE_URL with one trailing slash (if present) removed.
// NOTE(review): `base` is not referenced anywhere else in the visible page — confirm it is still needed.
const base = import.meta.env.BASE_URL.replace(/\/$/, '');

/**
 * Wraps one [name, text] pair in a schema.org HowToStep node.
 * Key order ('@type', name, text) is kept stable so the serialized output does not change.
 */
const toHowToStep = ([name, text]) => ({ '@type': 'HowToStep', name, text });

// Ordered [step name, step text] pairs; array order is the order of the guide's steps.
const howToStepPairs = [
  [
    'Configure Prometheus scrape and evaluation intervals',
    'In prometheus.yml, set scrape_interval and evaluation_interval (e.g. 20s). Point rule_files at your alerts/*.yml directory.',
  ],
  [
    'Write alert rules',
    'Create YAML rule files with alert name, expr (PromQL), for duration, severity label, and summary/description annotations.',
  ],
  [
    'Configure AlertManager routing',
    'In alertmanager.yml, define a route tree with group_wait, group_interval, repeat_interval, and child routes that match severity labels to specific receivers.',
  ],
  [
    'Set up receivers (Slack, PagerDuty, webhook)',
    'Add receiver blocks for each notification channel. For Slack, provide api_url, channel, and a message template. Use continue: true if multiple receivers should handle the same alert.',
  ],
  [
    'Add recording rules for expensive queries',
    'Wrap high-cardinality or frequently evaluated expressions in recording rules. Reference the recorded metric in your alert expressions to reduce Prometheus CPU usage.',
  ],
];

// schema.org HowTo structured data; passed to GuideLayout through its extraJsonLd prop.
const howToJsonLd = {
  '@context': 'https://schema.org',
  '@type': 'HowTo',
  name: 'How to configure Prometheus and AlertManager for production alerting',
  description:
    'Set up Prometheus alert rules, configure AlertManager routing and receivers, use recording rules to reduce load, and troubleshoot alert delivery delays.',
  step: howToStepPairs.map(toHowToStep),
};
---

<GuideLayout
  title="AlertManager Configuration"
  description="Prometheus and AlertManager configuration examples, recording rules, inhibition, and troubleshooting guide for alert timing and notification routing."
  breadcrumbs={[{ label: 'Guides' }, { label: 'AlertManager Config' }]}
  icon="bell"
  badge="Configuration Guide"
  extraJsonLd={howToJsonLd}
  dateUpdated="2025-01-15"
  readingTime={5}
  keywords="Prometheus, AlertManager, alerting, notification routing, alert timing, Slack alerts, recording rules, inhibition, PromQL"
>
  <p>
    If you notice a delay between an event and the first notification, read this post:
    {' '}<a href="https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html" target="_blank" rel="noopener noreferrer">Understanding the delays on alerting</a>.
  </p>

  <h2 id="prometheus-config">Prometheus configuration</h2>

  <p>
    Prometheus reads alert rules from YAML files and evaluates them on every <code>evaluation_interval</code> cycle.
    Keep <code>scrape_interval</code> and <code>evaluation_interval</code> aligned — evaluating more often than you scrape only re-checks the same samples, while evaluating less often delays detection.
  </p>

  <pre class="rule-code"><code>{`# prometheus.yml

global:
  scrape_interval: 20s

  # A short evaluation_interval will check alerting rules very often.
  # It can be costly if you run Prometheus with 100+ alerts.
  evaluation_interval: 20s

rule_files:
  - 'alerts/*.yml'

scrape_configs:
  # ...`}</code></pre>

  <pre class="rule-code"><code>{`# alerts/example-redis.yml

groups:

- name: ExampleRedisGroup
  rules:
  - alert: ExampleRedisDown
    expr: redis_up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: Redis instance down (instance {{ $labels.instance }})
      description: "Redis is unreachable\\n  VALUE = {{ $value }}\\n  LABELS = {{ $labels }}"

  - alert: ExampleRedisHighMemory
    expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Redis memory usage above 90% (instance {{ $labels.instance }})
      description: "Redis memory usage is {{ $value | humanizePercentage }}\\n  LABELS = {{ $labels }}"`}</code></pre>

  <h2 id="alertmanager-config">AlertManager configuration</h2>

  <p>
    AlertManager receives alerts from Prometheus, deduplicates and groups them, then routes them to the right receiver.
    The three key timing parameters control when notifications are sent:
  </p>
  <ul>
    <li><code>group_wait</code> — how long to wait for more alerts to batch into the first notification</li>
    <li><code>group_interval</code> — how long to wait before sending a follow-up for an ongoing group</li>
    <li><code>repeat_interval</code> — how often to re-notify if an alert hasn't resolved</li>
  </ul>

  <pre class="rule-code"><code>{`# alertmanager.yml

route:
  group_wait: 10s
  group_interval: 30s
  repeat_interval: 4h
  receiver: "slack"

  routes:
    # warnings and criticals → Slack
    - receiver: "slack"
      matchers:
        - severity =~ "critical|warning"
      continue: true

    # criticals also → PagerDuty
    - receiver: "pagerduty"
      matchers:
        - severity = "critical"

receivers:
  - name: "slack"
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXXXXXXXX/XXXXXXXXX/xxxxxxxxxxxxxxxxxxxxxxxxxxx'
        send_resolved: true
        channel: '#monitoring'
        title: '{{ if eq .Status "firing" }}:fire:{{ else }}:white_check_mark:{{ end }} {{ .CommonLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Severity:* {{ .Labels.severity }}
          {{ end }}

  - name: "pagerduty"
    pagerduty_configs:
      - routing_key: '<your-pagerduty-integration-key>'
        send_resolved: true`}</code></pre>

  <h2 id="inhibition">Inhibition rules</h2>

  <p>
    Inhibition suppresses lower-priority alerts when a higher-priority alert is already firing for the same target.
    A common pattern: mute <code>warning</code> alerts while a <code>critical</code> alert is active on the same instance.
  </p>

  <pre class="rule-code"><code>{`# alertmanager.yml

inhibit_rules:
  # Suppress warnings when a critical is firing for the same instance
  - source_matchers:
      - severity = "critical"
    target_matchers:
      - severity = "warning"
    equal:
      - alertname
      - instance

  # Suppress all alerts for a node when NodeDown is firing
  - source_matchers:
      - alertname = "NodeDown"
    target_matchers:
      - job = "node"
    equal:
      - instance`}</code></pre>

  <h2 id="recorded-rules">Reduce Prometheus server load</h2>

  <p>
    For expensive or frequently evaluated PromQL queries, use recording rules to precompute results.
    Alert rules and dashboards then reference the lightweight recorded metric instead of re-evaluating the full expression.
  </p>

  <pre class="rule-code"><code>{`groups:

  # 1. Define the recording rule
  - name: recordings
    rules:
    - record: job:rabbitmq_queue_messages_delivered_total:rate5m
      expr: rate(rabbitmq_queue_messages_delivered_total[5m])

  # 2. Reference it in alert rules
  - name: alerts
    rules:
    - alert: RabbitmqLowMessageDelivery
      expr: sum(job:rabbitmq_queue_messages_delivered_total:rate5m) < 10
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: Low message delivery rate in RabbitMQ
        description: "Delivery rate is {{ $value | humanize }} msg/s\\n  LABELS = {{ $labels }}"`}</code></pre>

  <h2 id="troubleshooting">Troubleshooting alert delays</h2>

  <p>
    The total time from an event occurring to a notification being sent is the sum of several independent delays.
    Work through them in order:
  </p>
  <ul>
    <li><strong>Scrape delay</strong>: up to <code>scrape_interval</code> (20s) before the metric is collected</li>
    <li><strong>Evaluation delay</strong>: up to <code>evaluation_interval</code> (20s) before the rule is next evaluated and the alert becomes <em>pending</em></li>
    <li><strong>Pending duration</strong>: the expression must stay true for the whole <code>for: 5m</code> window before the alert state changes to <em>firing</em></li>
    <li><strong>Group wait</strong>: AlertManager waits <code>group_wait</code> (10s) for other alerts to batch</li>
  </ul>

  <p>
    In the worst case with <code>for: 5m</code>: 20s + 20s + 5m + 10s ≈ <strong>6 minutes</strong> from event to notification.
    Reduce <code>evaluation_interval</code> and <code>for:</code> for time-sensitive alerts, but be careful of false positives from transient spikes.
  </p>

  <h2 id="resources">Further reading</h2>

  <ul>
    <li><a href="https://pracucci.com/prometheus-understanding-the-delays-on-alerting.html" target="_blank" rel="noopener noreferrer">Understanding the delays on alerting</a></li>
    <li><a href="https://hodovi.cc/blog/creating-awesome-alertmanager-templates-for-slack/" target="_blank" rel="noopener noreferrer">Creating awesome AlertManager templates for Slack</a></li>
    <li><a href="https://prometheus.io/docs/alerting/latest/configuration/" target="_blank" rel="noopener noreferrer">AlertManager configuration reference</a></li>
    <li><a href="https://grafana.com/blog/2024/10/03/how-to-use-prometheus-to-efficiently-detect-anomalies-at-scale/" target="_blank" rel="noopener noreferrer">How to use Prometheus to efficiently detect anomalies at scale</a></li>
  </ul>
</GuideLayout>