# awesome-prometheus-alerts/dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml

groups:

- name: PrometheusCloudwatchExporter

  # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges.
  # The rules below cover both exporter health and common AWS service alerts.
  # Adjust thresholds and label filters to match your CloudWatch exporter configuration.

  rules:

    # Exporter health: raised (warning) once cloudwatch_exporter_scrape_error
    # has stayed above zero for 5 minutes.
    - alert: CloudwatchExporterScrapeError
      expr: >-
        cloudwatch_exporter_scrape_error > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'CloudWatch exporter scrape error (instance {{ $labels.instance }})'
        description: |-
          CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Exporter health: raised (warning) when the reported scrape duration has
    # exceeded 300 seconds for 5 minutes.
    - alert: CloudwatchExporterSlowScrape
      expr: >-
        cloudwatch_exporter_scrape_duration_seconds > 300
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'CloudWatch exporter slow scrape (instance {{ $labels.instance }})'
        description: |-
          CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Cost guard: CloudWatch API calls are billed (~$0.01 per 1000 GetMetricData
    # requests), so 100 requests/minute comes to roughly $45/month. Tune the
    # threshold to your budget.
    # The expression scales the per-second rate by 60 to get calls per minute,
    # summed per (instance, namespace); with "for: 0m" it fires immediately.
    - alert: CloudwatchApiHighRequestRate
      expr: >-
        sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: 'CloudWatch API high request rate (instance {{ $labels.instance }})'
        description: |-
          CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the CloudWatch exporter must be configured to export the
    # EC2 CPUUtilization metric.
    # Raised (warning) when the average stays above 90 for 15 minutes.
    - alert: AwsEc2HighCpuUtilization
      expr: >-
        aws_ec2_cpuutilization_average > 90
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: 'AWS EC2 high CPU utilization (instance {{ $labels.instance }})'
        description: |-
          EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the RDS FreeStorageSpace metric must be exported.
    # The 2GB (2000000000 bytes) threshold is only a rough default; size it to
    # your database.
    - alert: AwsRdsLowFreeStorageSpace
      expr: >-
        aws_rds_free_storage_space_average < 2000000000
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'AWS RDS low free storage space (instance {{ $labels.instance }})'
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the CloudWatch exporter must be configured to export the
    # RDS CPUUtilization metric.
    # Raised (warning) when the average stays above 90 for 15 minutes.
    - alert: AwsRdsHighCpuUtilization
      expr: >-
        aws_rds_cpuutilization_average > 90
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: 'AWS RDS high CPU utilization (instance {{ $labels.instance }})'
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # The right threshold varies with the RDS instance class: set it from your
    # instance type's max_connections parameter. 100 is the value used here.
    - alert: AwsRdsHighDatabaseConnections
      expr: >-
        aws_rds_database_connections_average > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'AWS RDS high database connections (instance {{ $labels.instance }})'
        description: |-
          RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the SQS ApproximateNumberOfMessagesVisible metric must be
    # exported. The threshold of 1000 is a rough default; set it according to
    # the queue depth you expect.
    - alert: AwsSqsQueueMessagesVisible
      expr: >-
        aws_sqs_approximate_number_of_messages_visible_average > 1000
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: 'AWS SQS queue messages visible (instance {{ $labels.instance }})'
        description: |-
          SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the SQS ApproximateAgeOfOldestMessage metric must be
    # exported. Fires immediately ("for: 0m") once the maximum exceeds 3600
    # seconds.
    - alert: AwsSqsMessageAgeTooOld
      expr: >-
        aws_sqs_approximate_age_of_oldest_message_maximum > 3600
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: 'AWS SQS message age too old (instance {{ $labels.instance }})'
        description: |-
          SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the ApplicationELB UnHealthyHostCount metric must be
    # exported. Raised (critical) when the average stays above zero for
    # 5 minutes.
    - alert: AwsAlbUnhealthyTargets
      expr: >-
        aws_applicationelb_unhealthy_host_count_average > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'AWS ALB unhealthy targets (instance {{ $labels.instance }})'
        description: |-
          ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: both the ApplicationELB HTTPCode_ELB_5XX_Count and
    # RequestCount metrics must be exported.
    # The ratio is expressed as a percentage; the trailing "and ... > 0" clause
    # restricts the result to series whose request count is positive.
    - alert: AwsAlbHigh5xxErrorRate
      expr: >-
        (aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5
        and aws_applicationelb_request_count_sum > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: 'AWS ALB high 5xx error rate (instance {{ $labels.instance }})'
        description: |-
          ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: the ApplicationELB TargetResponseTime metric must be
    # exported. Raised (warning) when the average stays above 2 for 5 minutes.
    - alert: AwsAlbHighTargetResponseTime
      expr: >-
        aws_applicationelb_target_response_time_average > 2
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'AWS ALB high target response time (instance {{ $labels.instance }})'
        description: |-
          ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}

    # Prerequisite: both the Lambda Errors and Invocations metrics must be
    # exported.
    # The ratio is expressed as a percentage; the trailing "and ... > 0" clause
    # restricts the result to series whose invocation count is positive.
    - alert: AwsLambdaHighErrorRate
      expr: >-
        (aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5
        and aws_lambda_invocations_sum > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'AWS Lambda high error rate (instance {{ $labels.instance }})'
        description: |-
          Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).
            VALUE = {{ $value }}
            LABELS = {{ $labels }}