---
# Alerting rules for the Prometheus CloudWatch exporter.
#
# The exporter publishes CloudWatch data as gauges named
# aws_{namespace}_{metric_name}_{statistic}. This group contains two kinds of
# rules: health checks for the exporter itself, and alerts for commonly used
# AWS services. Thresholds and label filters are starting points; tune them to
# match your CloudWatch exporter configuration.
groups:
  - name: PrometheusCloudwatchExporter
    rules:
      # ----- Exporter health -----

      # The exporter reports a scrape error against the CloudWatch API.
      - alert: CloudwatchExporterScrapeError
        expr: 'cloudwatch_exporter_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch exporter scrape error (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A single scrape takes longer than 300 seconds (5 minutes).
      - alert: CloudwatchExporterSlowScrape
        expr: 'cloudwatch_exporter_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch exporter slow scrape (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # CloudWatch API calls are billed (~$0.01 per 1000 GetMetricData
      # requests), so 100 requests/minute is roughly $45/month. Pick a
      # threshold that fits your budget.
      # The per-second rate is multiplied by 60 to express calls per minute.
      - alert: CloudwatchApiHighRequestRate
        expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch API high request rate (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ----- EC2 -----

      # Needs the EC2 CPUUtilization metric to be configured in the exporter.
      - alert: AwsEc2HighCpuUtilization
        expr: 'aws_ec2_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'AWS EC2 high CPU utilization (instance {{ $labels.instance }})'
          description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ----- RDS -----

      # Needs the RDS FreeStorageSpace metric. 2GB is only a rough default;
      # scale it to the size of your database.
      - alert: AwsRdsLowFreeStorageSpace
        expr: 'aws_rds_free_storage_space_average < 2000000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS low free storage space (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the RDS CPUUtilization metric to be configured in the exporter.
      - alert: AwsRdsHighCpuUtilization
        expr: 'aws_rds_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS high CPU utilization (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A sensible limit depends on the RDS instance class; derive it from
      # the max_connections parameter of your instance type.
      - alert: AwsRdsHighDatabaseConnections
        expr: 'aws_rds_database_connections_average > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS high database connections (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ----- SQS -----

      # Needs the SQS ApproximateNumberOfMessagesVisible metric. 1000 is only
      # a rough default; set it from the queue depth you normally expect.
      - alert: AwsSqsQueueMessagesVisible
        expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'AWS SQS queue messages visible (instance {{ $labels.instance }})'
          description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the SQS ApproximateAgeOfOldestMessage metric.
      # 3600 seconds is the "1 hour" quoted in the description.
      - alert: AwsSqsMessageAgeTooOld
        expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'AWS SQS message age too old (instance {{ $labels.instance }})'
          description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ----- Application Load Balancer -----

      # Needs the ApplicationELB UnHealthyHostCount metric.
      - alert: AwsAlbUnhealthyTargets
        expr: 'aws_applicationelb_unhealthy_host_count_average > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'AWS ALB unhealthy targets (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount
      # metrics. The trailing "and ... > 0" clause limits the alert to series
      # that actually received requests.
      - alert: AwsAlbHigh5xxErrorRate
        expr: >-
          (aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5
          and aws_applicationelb_request_count_sum > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'AWS ALB high 5xx error rate (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB TargetResponseTime metric.
      - alert: AwsAlbHighTargetResponseTime
        expr: 'aws_applicationelb_target_response_time_average > 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS ALB high target response time (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ----- Lambda -----

      # Needs the Lambda Errors and Invocations metrics. The trailing
      # "and ... > 0" clause limits the alert to functions that were invoked.
      - alert: AwsLambdaHighErrorRate
        expr: >-
          (aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5
          and aws_lambda_invocations_sum > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS Lambda high error rate (instance {{ $labels.instance }})'
          description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"