# awesome-prometheus-alerts/dist/rules/ceph/embedded-exporter.yml

# Prometheus rule file: `groups` is the top-level list of rule groups.
groups:

# Rule group named EmbeddedExporter; its alerting rules are listed under `rules`.
- name: EmbeddedExporter


  # Alerting rules of this group (each entry is one `alert` definition).
  rules:

    # ceph_health_status values: 0 = HEALTH_OK, 1 = HEALTH_WARN, 2 = HEALTH_ERR.
    # Any value other than 0, held for 1 minute, raises this alert as critical.
    # To tell warning and error states apart, split this into two rules that
    # match `== 1` and `== 2` respectively.
    - alert: CephState
      expr: 'ceph_health_status != 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Ceph State (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph instance unhealthy
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when the absolute value of ceph_monitor_clock_skew_seconds stays
    # above 0.2 for 2 minutes; abs() makes the check direction-independent.
    - alert: CephMonitorClockSkew
      expr: 'abs(ceph_monitor_clock_skew_seconds) > 0.2'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph monitor clock skew (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph monitor clock skew detected. Please check ntp and hardware clock settings
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when ceph_monitor_avail_percent stays below 10 for 2 minutes.
    - alert: CephMonitorLowSpace
      expr: 'ceph_monitor_avail_percent < 10'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph monitor low space (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph monitor storage is low.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical alert for any series whose ceph_osd_up value is 0 for 1 minute.
    - alert: CephOsdDown
      expr: 'ceph_osd_up == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Ceph OSD Down (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph Object Storage Daemon Down
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The threshold is 5000 ms (5 seconds); tune it to the OSD performance
    # you expect. The alert fires once the value has exceeded it for 1 minute.
    - alert: CephHighOsdLatency
      expr: 'ceph_osd_apply_latency_ms > 5000'
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph high OSD latency (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph Object Storage Daemon latency is high. Please check if it doesn't stuck in weird state.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Ceph itself raises the OSD_NEARFULL health check according to its
    # nearfull_ratio (default 85%); this rule warns when that check has been
    # reported (value 1) for 5 minutes.
    # ceph_health_detail can also serve finer-grained OSD space alerts.
    - alert: CephOsdNearFull
      expr: 'ceph_health_detail{name="OSD_NEARFULL"} == 1'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph OSD near full (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          A Ceph OSD is dangerously full. Please add more disks.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when ceph_osd_weight stays below 1 for 2 minutes.
    # The description states exactly that condition; the previous wording
    # ("takes too much time to resize") did not match what the expression checks.
    - alert: CephOsdReweighted
      expr: 'ceph_osd_weight < 1'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph OSD reweighted (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Ceph Object Storage Daemon has been reweighted: its weight is below 1.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical alert as soon as ceph_pg_down is above 0.
    # `for: 0m` means there is no pending period before the alert fires.
    - alert: CephPgDown
      expr: 'ceph_pg_down > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: 'Ceph PG down (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Some Ceph placement groups are down. Please ensure that all the data are available.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical alert as soon as ceph_pg_incomplete is above 0.
    # `for: 0m` means there is no pending period before the alert fires.
    - alert: CephPgIncomplete
      expr: 'ceph_pg_incomplete > 0'
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: 'Ceph PG incomplete (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Some Ceph placement groups are incomplete. Please ensure that all the data are available.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns as soon as ceph_pg_inconsistent is above 0.
    # `for: 0m` means there is no pending period before the alert fires.
    - alert: CephPgInconsistent
      expr: 'ceph_pg_inconsistent > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph PG inconsistent (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when ceph_pg_activating stays above 0 for 2 minutes.
    - alert: CephPgActivationLong
      expr: 'ceph_pg_activating > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph PG activation long (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Some Ceph placement groups are too long to activate.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Warns when ceph_pg_backfill_toofull stays above 0 for 2 minutes.
    - alert: CephPgBackfillFull
      expr: 'ceph_pg_backfill_toofull > 0'
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: 'Ceph PG backfill full (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text. The first line is left
        # unwrapped because a literal scalar would turn a wrap into a newline.
        description: |-
          Some Ceph placement groups are located on full Object Storage Daemon on cluster. Those PGs can be unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Critical alert when ceph_pg_total minus ceph_pg_active stays above 0
    # for 1 minute, i.e. fewer placement groups are active than exist in total.
    - alert: CephPgUnavailable
      expr: 'ceph_pg_total - ceph_pg_active > 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: 'Ceph PG unavailable (instance {{ $labels.instance }})'
        # Literal block scalar; the indented VALUE/LABELS lines keep their
        # two leading spaces in the rendered text.
        description: |-
          Some Ceph placement groups are unavailable.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}