# Mirror of https://github.com/samber/awesome-prometheus-alerts.git
# (synced 2026-06-23 01:47:17 +08:00; 102 lines, 4.7 KiB, YAML at time of sync)
# Prometheus alerting rules for Hadoop components (rule group "Jmx_exporter").
#
# NOTE(review): the hadoop_* metric names below presumably come from the
# exporter's own rule mapping — verify that they match the series actually
# exposed in your environment before relying on these alerts.
groups:
  - name: Jmx_exporter
    rules:
      # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
      # so this alert may not fire. Prefer application-level availability metrics if available.
      # Rename job="hadoop-namenode" to match the actual job name in your Prometheus scrape config.
      - alert: HadoopNameNodeDown
        expr: 'up{job="hadoop-namenode"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Name Node Down (instance {{ $labels.instance }})
          description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # When targets are managed via service discovery, a disappeared target goes stale rather than reporting up==0,
      # so this alert may not fire. Prefer application-level availability metrics if available.
      # Rename job="hadoop-resourcemanager" to match the actual job name in your Prometheus scrape config.
      - alert: HadoopResourceManagerDown
        expr: 'up{job="hadoop-resourcemanager"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
          description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when hadoop_datanode_last_heartbeat has been exactly 0 for 10 minutes.
      # NOTE(review): assumes the exporter reports 0 when no heartbeat is being received — confirm.
      - alert: HadoopDataNodeOutOfService
        expr: 'hadoop_datanode_last_heartbeat == 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
          description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the free fraction, (total - used) / total, stays below 0.1 for 15 minutes.
      # The trailing "total > 0" clause only keeps series whose reported total is positive.
      - alert: HadoopHdfsDiskSpaceLow
        expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1 and hadoop_hdfs_bytes_total > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
          description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the counter's increase over the trailing 1h window stays above 100 for 10 minutes.
      - alert: HadoopMapReduceTaskFailures
        expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
          description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when memory_bytes / memory_max_bytes stays above 0.8 for 15 minutes.
      # The trailing "max > 0" clause only keeps series whose reported maximum is positive.
      - alert: HadoopResourceManagerMemoryHigh
        expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8 and hadoop_resourcemanager_memory_max_bytes > 0'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
          description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the counter's increase over the trailing 1h window stays above 10 for 10 minutes.
      - alert: HadoopYarnContainerAllocationFailures
        expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
          description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the reported region count stays above 5000 for 15 minutes.
      - alert: HadoopHbaseRegionCountHigh
        expr: 'hadoop_hbase_region_count > 5000'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
          description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when heap_bytes / max_heap_bytes stays above 0.8 for 10 minutes.
      # The trailing "max > 0" clause only keeps series whose reported maximum is positive.
      # NOTE(review): presumably heap_bytes is the heap currently in use — confirm against the exporter.
      - alert: HadoopHbaseRegionServerHeapLow
        expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes > 0.8 and hadoop_hbase_region_server_max_heap_bytes > 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
          description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when the reported write latency stays above 0.5 for 10 minutes
      # (the metric name carries a _seconds suffix).
      - alert: HadoopHbaseWriteRequestsLatencyHigh
        expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
          description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"