mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-21 00:47:18 +08:00
Publish
This commit is contained in:
parent
8b443be6d2
commit
eeba1ebbaa
5 changed files with 420 additions and 0 deletions
141
dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
vendored
Normal file
141
dist/rules/aws-cloudwatch/prometheus-cloudwatch-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,141 @@
|
|||
groups:
  - name: PrometheusCloudwatchExporter
    # Metric naming: the exporter publishes CloudWatch metrics as gauges named
    # aws_{namespace}_{metric_name}_{statistic}.
    # This group combines exporter self-health alerts with alerts on common
    # AWS services. Thresholds and label filters are defaults: tune them to
    # your CloudWatch exporter configuration.
    rules:
      # Exporter health: the scrape-error metric has been above 0 for 5 minutes.
      - alert: CloudwatchExporterScrapeError
        expr: 'cloudwatch_exporter_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch exporter scrape error (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} failed to scrape metrics from AWS CloudWatch API.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Exporter health: a scrape has been taking longer than 300 seconds for 5 minutes.
      - alert: CloudwatchExporterSlowScrape
        expr: 'cloudwatch_exporter_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch exporter slow scrape (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} scrape is taking more than 5 minutes ({{ $value }}s). Consider reducing the number of metrics or splitting across multiple exporters.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # GetMetricData requests are billed (~$0.01 per 1000 requests), so
      # 100 requests/minute is roughly $45/month. The expression converts the
      # per-second rate to requests per minute (* 60); choose a threshold that
      # fits your budget.
      - alert: CloudwatchApiHighRequestRate
        expr: 'sum by (instance, namespace) (rate(cloudwatch_requests_total[5m])) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'CloudWatch API high request rate (instance {{ $labels.instance }})'
          description: "CloudWatch exporter on {{ $labels.instance }} is making {{ $value }} API calls per minute to namespace {{ $labels.namespace }}. This can lead to high AWS costs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the EC2 CPUUtilization metric to be configured in the CloudWatch exporter.
      - alert: AwsEc2HighCpuUtilization
        expr: 'aws_ec2_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'AWS EC2 high CPU utilization (instance {{ $labels.instance }})'
          description: "EC2 instance {{ $labels.instance_id }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the RDS FreeStorageSpace metric. 2GB is only a rough default;
      # size the threshold to your database.
      - alert: AwsRdsLowFreeStorageSpace
        expr: 'aws_rds_free_storage_space_average < 2000000000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS low free storage space (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} has less than 2GB free storage ({{ $value }} bytes remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the RDS CPUUtilization metric to be configured in the CloudWatch exporter.
      - alert: AwsRdsHighCpuUtilization
        expr: 'aws_rds_cpuutilization_average > 90'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS high CPU utilization (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} CPU utilization is above 90% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The right threshold depends on the RDS instance class: align it with
      # the max_connections parameter of your instance type.
      - alert: AwsRdsHighDatabaseConnections
        expr: 'aws_rds_database_connections_average > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS RDS high database connections (instance {{ $labels.instance }})'
          description: "RDS instance {{ $labels.dbinstance_identifier }} has {{ $value }} active connections.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the SQS ApproximateNumberOfMessagesVisible metric. 1000 is a
      # rough default; set it from your expected queue depth.
      - alert: AwsSqsQueueMessagesVisible
        expr: 'aws_sqs_approximate_number_of_messages_visible_average > 1000'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'AWS SQS queue messages visible (instance {{ $labels.instance }})'
          description: "SQS queue {{ $labels.queue_name }} has {{ $value }} messages waiting to be processed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the SQS ApproximateAgeOfOldestMessage metric.
      - alert: AwsSqsMessageAgeTooOld
        expr: 'aws_sqs_approximate_age_of_oldest_message_maximum > 3600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'AWS SQS message age too old (instance {{ $labels.instance }})'
          description: "SQS queue {{ $labels.queue_name }} has messages older than 1 hour ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB UnHealthyHostCount metric.
      - alert: AwsAlbUnhealthyTargets
        expr: 'aws_applicationelb_unhealthy_host_count_average > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'AWS ALB unhealthy targets (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} has {{ $value }} unhealthy target(s) in target group {{ $labels.target_group }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs both ApplicationELB metrics: HTTPCode_ELB_5XX_Count and RequestCount.
      - alert: AwsAlbHigh5xxErrorRate
        expr: '(aws_applicationelb_httpcode_elb_5_xx_count_sum / aws_applicationelb_request_count_sum) * 100 > 5'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'AWS ALB high 5xx error rate (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} 5xx error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs the ApplicationELB TargetResponseTime metric.
      - alert: AwsAlbHighTargetResponseTime
        expr: 'aws_applicationelb_target_response_time_average > 2'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS ALB high target response time (instance {{ $labels.instance }})'
          description: "ALB {{ $labels.load_balancer }} average target response time is above 2 seconds ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Needs both Lambda metrics: Errors and Invocations.
      - alert: AwsLambdaHighErrorRate
        expr: '(aws_lambda_errors_sum / aws_lambda_invocations_sum) * 100 > 5'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'AWS Lambda high error rate (instance {{ $labels.instance }})'
          description: "Lambda function {{ $labels.function_name }} error rate is above 5% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
57
dist/rules/azure/azure-metrics-exporter.yml
vendored
Normal file
57
dist/rules/azure/azure-metrics-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
groups:
  - name: AzureMetricsExporter
    # Forwarded Azure Monitor metrics are named azurerm_resource_metric by
    # default; the name parameter of the probe configuration can override it.
    # The exporter's own self-monitoring metrics live under the
    # azurerm_stats_* and azurerm_api_* prefixes.
    rules:
      # More than 5 failed metric requests were counted over the last 15 minutes.
      - alert: AzureExporterRequestErrors
        expr: 'increase(azurerm_stats_metric_requests{result="error"}[15m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Azure exporter request errors (instance {{ $labels.instance }})'
          description: "Azure metrics exporter on {{ $labels.instance }} has {{ $value }} API request errors in the last 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Share of failed requests per instance (error rate / total rate, in
      # percent) has been above 10 for 5 minutes.
      - alert: AzureExporterHighErrorRate
        expr: 'sum by (instance) (rate(azurerm_stats_metric_requests{result="error"}[5m])) / sum by (instance) (rate(azurerm_stats_metric_requests[5m])) * 100 > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Azure exporter high error rate (instance {{ $labels.instance }})'
          description: "Azure metrics exporter on {{ $labels.instance }} has an error rate above 10% ({{ $value }}%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Azure Resource Manager rate-limits API calls per subscription.
      # 100 remaining calls is a rough default; adapt it to your scrape
      # interval and to how many resources you monitor.
      - alert: AzureApiReadRateLimitApproaching
        expr: 'azurerm_api_ratelimit{type="read"} < 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Azure API read rate limit approaching (instance {{ $labels.instance }})'
          description: "Azure API read rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same check for the write quota, with a threshold of 50 remaining calls.
      - alert: AzureApiWriteRateLimitApproaching
        expr: 'azurerm_api_ratelimit{type="write"} < 50'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Azure API write rate limit approaching (instance {{ $labels.instance }})'
          description: "Azure API write rate limit for subscription {{ $labels.subscriptionID }} is running low ({{ $value }} remaining).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Metric collection time has been above 300 (seconds, per the
      # description) for 5 minutes.
      - alert: AzureExporterSlowCollection
        expr: 'azurerm_stats_metric_collecttime > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Azure exporter slow collection (instance {{ $labels.instance }})'
          description: "Azure metrics exporter on {{ $labels.instance }} metric collection is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
97
dist/rules/digitalocean/digitalocean-exporter.yml
vendored
Normal file
97
dist/rules/digitalocean/digitalocean-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
groups:
  - name: DigitaloceanExporter
    # Alerts on DigitalOcean resources (droplets, managed databases,
    # Kubernetes clusters, load balancers, floating IPs), on account state,
    # and on the exporter's own collection errors.
    rules:
      # A droplet has reported digitalocean_droplet_up == 0 for 5 minutes.
      - alert: DigitaloceanDropletDown
        expr: 'digitalocean_droplet_up == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'DigitalOcean droplet down (instance {{ $labels.instance }})'
          description: "DigitalOcean droplet {{ $labels.name }} ({{ $labels.id }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The account-active metric is anything other than 1.
      - alert: DigitaloceanAccountNotActive
        expr: 'digitalocean_account_active != 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'DigitalOcean account not active (instance {{ $labels.instance }})'
          description: "DigitalOcean account is not active. It may be suspended or locked.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A managed database has reported status 0 for 2 minutes.
      - alert: DigitaloceanDatabaseDown
        expr: 'digitalocean_database_status == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'DigitalOcean database down (instance {{ $labels.instance }})'
          description: "DigitalOcean managed database {{ $labels.name }} ({{ $labels.engine }}) in {{ $labels.region }} is offline.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A Kubernetes cluster has reported up == 0 for 5 minutes.
      - alert: DigitaloceanKubernetesClusterDown
        expr: 'digitalocean_kubernetes_cluster_up == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'DigitalOcean Kubernetes cluster down (instance {{ $labels.instance }})'
          description: "DigitalOcean Kubernetes cluster {{ $labels.name }} ({{ $labels.version }}) in {{ $labels.region }} is not running.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A load balancer has reported status 0 for 2 minutes.
      - alert: DigitaloceanLoadBalancerDown
        expr: 'digitalocean_loadbalancer_status == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'DigitalOcean load balancer down (instance {{ $labels.instance }})'
          description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) is not active.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A load balancer reports zero attached droplets.
      - alert: DigitaloceanLoadBalancerNoBackends
        expr: 'digitalocean_loadbalancer_droplets == 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'DigitalOcean load balancer no backends (instance {{ $labels.instance }})'
          description: "DigitalOcean load balancer {{ $labels.name }} ({{ $labels.ip }}) has no droplets attached.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # A floating IPv4 reports active == 0.
      - alert: DigitaloceanFloatingIpNotAssigned
        expr: 'digitalocean_floating_ipv4_active == 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'DigitalOcean floating IP not assigned (instance {{ $labels.instance }})'
          description: "DigitalOcean floating IP {{ $labels.ipv4 }} in {{ $labels.region }} is not assigned to any droplet.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The incidents metric is above 0.
      - alert: DigitaloceanActiveIncidents
        expr: 'digitalocean_incidents_total > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'DigitalOcean active incidents (instance {{ $labels.instance }})'
          description: "DigitalOcean platform has {{ $value }} active incident(s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The exporter's error counter grew during the last 5 minutes.
      - alert: DigitaloceanExporterCollectionErrors
        expr: 'increase(digitalocean_errors_total[5m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'DigitalOcean exporter collection errors (instance {{ $labels.instance }})'
          description: "DigitalOcean exporter {{ $labels.collector }} collector has {{ $value }} errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when more than 80% of the account's droplet limit is in use.
      # The count is grouped by (instance, job) and the division is matched on
      # those same labels. An ungrouped count() returns a single series with
      # no labels, which can never be matched one-to-one against
      # digitalocean_account_droplet_limit (it carries at least the scrape
      # target's instance/job labels), so the ratio would always be empty and
      # the alert would never fire. Grouping also keeps the instance label
      # that the summary annotation references.
      - alert: DigitaloceanDropletLimitApproaching
        expr: '(count by (instance, job) (digitalocean_droplet_up) / on (instance, job) digitalocean_account_droplet_limit) * 100 > 80'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'DigitalOcean droplet limit approaching (instance {{ $labels.instance }})'
          description: "DigitalOcean account is using {{ $value }}% of its droplet quota.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
53
dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml
vendored
Normal file
53
dist/rules/google-cloud-stackdriver/stackdriver-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
groups:
  - name: StackdriverExporter
    # The exporter's self-monitoring metrics share the stackdriver_monitoring_*
    # prefix, and each of them carries a project_id label.
    rules:
      # The last-scrape-error metric has been above 0 for 5 minutes.
      - alert: StackdriverExporterScrapeError
        expr: 'stackdriver_monitoring_last_scrape_error > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Stackdriver exporter scrape error (instance {{ $labels.instance }})'
          description: "Stackdriver exporter failed to scrape metrics from Google Cloud Monitoring API for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The last scrape duration has been above 300 seconds for 5 minutes.
      - alert: StackdriverExporterSlowScrape
        expr: 'stackdriver_monitoring_last_scrape_duration_seconds > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Stackdriver exporter slow scrape (instance {{ $labels.instance }})'
          description: "Stackdriver exporter scrape for project {{ $labels.project_id }} is taking more than 5 minutes ({{ $value }}s).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The scrape-error counter grew by more than 5 over the last 15 minutes.
      - alert: StackdriverExporterScrapeErrorsIncreasing
        expr: 'increase(stackdriver_monitoring_scrape_errors_total[15m]) > 5'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Stackdriver exporter scrape errors increasing (instance {{ $labels.instance }})'
          description: "Stackdriver exporter has had {{ $value }} scrape errors in the last 15 minutes for project {{ $labels.project_id }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # API call rate, converted from per-second to per-minute (* 60), exceeds 100.
      - alert: StackdriverExporterHighApiCalls
        expr: 'rate(stackdriver_monitoring_api_calls_total[5m]) * 60 > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Stackdriver exporter high API calls (instance {{ $labels.instance }})'
          description: "Stackdriver exporter is making {{ $value }} API calls per minute for project {{ $labels.project_id }}. This may hit Google Cloud Monitoring API quotas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The last scrape timestamp is more than 600 seconds behind the current time.
      - alert: StackdriverExporterScrapeStale
        expr: 'time() - stackdriver_monitoring_last_scrape_timestamp > 600'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Stackdriver exporter scrape stale (instance {{ $labels.instance }})'
          description: "Stackdriver exporter has not successfully scraped metrics for project {{ $labels.project_id }} in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
72
dist/rules/systemd/systemd-exporter.yml
vendored
Normal file
72
dist/rules/systemd/systemd-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
groups:
  - name: SystemdExporter
    # Alerts on systemd unit state, service restarts, task limits, socket
    # activity and timer triggers.
    rules:
      # A unit has been in the "failed" state for 5 minutes.
      - alert: SystemdUnitFailed
        expr: 'systemd_unit_state{state="failed"} == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd unit failed (instance {{ $labels.instance }})'
          description: "Systemd unit {{ $labels.name }} has entered failed state. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Plenty of units are inactive on purpose, so this rule only makes sense
      # once the name=~ matcher below is edited to select your critical services.
      - alert: SystemdUnitInactive
        expr: 'systemd_unit_state{state="inactive", type="service", name=~"your-critical-service.+"} == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd unit inactive (instance {{ $labels.instance }})'
          description: "Systemd unit {{ $labels.name }} is inactive. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The restart counter grew by more than 5 within the last hour.
      - alert: SystemdServiceCrashLooping
        expr: 'increase(systemd_service_restart_total[1h]) > 5'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Systemd service crash looping (instance {{ $labels.instance }})'
          description: "Systemd service {{ $labels.name }} has restarted {{ $value }} times in the last hour. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Current tasks exceed 90% of the unit's task maximum. The second
      # condition restricts the result to units whose maximum is above 0.
      - alert: SystemdUnitTasksNearLimit
        expr: 'systemd_unit_tasks_current / systemd_unit_tasks_max > 0.9 and systemd_unit_tasks_max > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd unit tasks near limit (instance {{ $labels.instance }})'
          description: "Systemd unit {{ $labels.name }} is using {{ $value | humanizePercentage }} of its task limit. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The refused-connections counter grew during the last 5 minutes.
      - alert: SystemdSocketRefusedConnections
        expr: 'increase(systemd_socket_refused_connections_total[5m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd socket refused connections (instance {{ $labels.instance }})'
          description: "Systemd socket {{ $labels.name }} is refusing connections. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The limit of 100 connections is arbitrary; set it to suit your workload.
      - alert: SystemdSocketHighConnections
        expr: 'systemd_socket_current_connections > 100'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd socket high connections (instance {{ $labels.instance }})'
          description: "Systemd socket {{ $labels.name }} has {{ $value }} active connections. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when a timer's last trigger is more than 24 hours old; timers
      # whose last-trigger value is 0 are excluded. Adjust the threshold to
      # each timer's schedule.
      - alert: SystemdTimerMissedTrigger
        expr: '(time() - systemd_timer_last_trigger_seconds) / 3600 > 24 and systemd_timer_last_trigger_seconds > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Systemd timer missed trigger (instance {{ $labels.instance }})'
          description: "Systemd timer {{ $labels.name }} has not triggered for over 24 hours. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
Loading…
Reference in a new issue