# Mirror of https://github.com/samber/awesome-prometheus-alerts.git
# Synced 2026-06-22 01:17:19 +08:00
# 141 lines, 7 KiB, YAML
groups:
  - name: PrometheusCloudwatchExporter

    # CloudWatch metrics are exported as aws_{namespace}_{metric_name}_{statistic} gauges.
    # The rules below cover both exporter health and common AWS service alerts.
    # Adjust thresholds and label filters to match your CloudWatch exporter configuration.

    rules:

      # Exporter health: the exporter reported a scrape error for 5 minutes.
      - alert: CloudwatchExporterScrapeError
        expr: 'cloudwatch_exporter_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CloudWatch exporter scrape error (instance {{ $labels.instance }})
          description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Exporter health: a scrape has been taking longer than 300 seconds
      # (5 minutes) for 5 minutes.
      - alert: CloudwatchExporterSlowScrape
        expr: 'cloudwatch_exporter_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: CloudWatch exporter slow scrape (instance {{ $labels.instance }})
          description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # CloudWatch API calls cost money (~$0.01 per 1000 GetMetricData requests).
      # 100 requests/minute ≈ $45/month. Adjust the threshold based on your budget.
      # The per-second rate is multiplied by 60 to compare against a per-minute
      # threshold; `for: 0m` means the alert fires without a pending period.
      - alert: CloudwatchApiHighRequestRate
        expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: CloudWatch API high request rate (instance {{ $labels.instance }})
          description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires EC2 CPUUtilization metric configured in the CloudWatch exporter.
      - alert: AwsEc2HighCpuUtilization
        expr: 'aws_ec2_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: AWS EC2 high CPU utilization (instance {{ $labels.instance }})
          description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires RDS FreeStorageSpace metric. The threshold of 2GB is a rough default.
      # Adjust based on your database size.
      - alert: AwsRdsLowFreeStorageSpace
        expr: 'aws_rds_free_storage_space_average < 2000000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: AWS RDS low free storage space (instance {{ $labels.instance }})
          description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires RDS CPUUtilization metric configured in the CloudWatch exporter.
      - alert: AwsRdsHighCpuUtilization
        expr: 'aws_rds_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: AWS RDS high CPU utilization (instance {{ $labels.instance }})
          description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The threshold depends on the RDS instance class. Adjust based on your
      # instance type's max_connections parameter.
      - alert: AwsRdsHighDatabaseConnections
        expr: 'aws_rds_database_connections_average > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: AWS RDS high database connections (instance {{ $labels.instance }})
          description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires SQS ApproximateNumberOfMessagesVisible metric. The threshold of 1000
      # is a rough default. Adjust based on your expected queue depth.
      - alert: AwsSqsQueueMessagesVisible
        expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: AWS SQS queue messages visible (instance {{ $labels.instance }})
          description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires SQS ApproximateAgeOfOldestMessage metric.
      # The threshold is 3600 seconds (1 hour); `for: 0m` means the alert fires
      # without a pending period.
      - alert: AwsSqsMessageAgeTooOld
        expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: AWS SQS message age too old (instance {{ $labels.instance }})
          description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires ApplicationELB UnHealthyHostCount metric.
      - alert: AwsAlbUnhealthyTargets
        expr: 'aws_applicationelb_unhealthy_host_count_average > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: AWS ALB unhealthy targets (instance {{ $labels.instance }})
          description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires ApplicationELB HTTPCode_ELB_5XX_Count and RequestCount metrics.
      # The trailing `and ... > 0` restricts the alert to series that actually
      # served requests, so the ratio is only evaluated with a non-zero divisor.
      - alert: AwsAlbHigh5xxErrorRate
        expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5 and aws_applicationelb_request_count_sum > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: AWS ALB high 5xx error rate (instance {{ $labels.instance }})
          description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires ApplicationELB TargetResponseTime metric.
      - alert: AwsAlbHighTargetResponseTime
        expr: 'aws_applicationelb_target_response_time_average > 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: AWS ALB high target response time (instance {{ $labels.instance }})
          description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Requires Lambda Errors and Invocations metrics.
      # The trailing `and ... > 0` restricts the alert to functions that were
      # actually invoked, so the ratio is only evaluated with a non-zero divisor.
      - alert: AwsLambdaHighErrorRate
        expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5 and aws_lambda_invocations_sum > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: AWS Lambda high error rate (instance {{ $labels.instance }})
          description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"