This commit is contained in:
samber 2023-10-06 16:50:22 +00:00
parent 7a8f883df6
commit 82f2798620
2 changed files with 95 additions and 102 deletions

View file

@@ -1,102 +0,0 @@
---
# Prometheus alerting rules for Hadoop components: NameNode, ResourceManager,
# DataNode, HDFS capacity, MapReduce, YARN and HBase.
# Every rule must hold for its `for` duration before the alert fires.
groups:
  - name: HadoopAlerts
    rules:
      # `up` for the hadoop-namenode scrape job has been 0 for 5 minutes.
      - alert: HadoopNameNodeDown
        expr: 'up{job="hadoop-namenode"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop NameNode is down'
          description: 'The Hadoop NameNode service is unavailable.'

      # `up` for the hadoop-resourcemanager scrape job has been 0 for 5 minutes.
      - alert: HadoopResourceManagerDown
        expr: 'up{job="hadoop-resourcemanager"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop ResourceManager is down'
          description: 'The Hadoop ResourceManager service is unavailable.'

      # The DataNode last-heartbeat metric has been exactly 0 for 10 minutes.
      - alert: HadoopDataNodeOutOfService
        expr: 'hadoop_datanode_last_heartbeat == 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop DataNode is out of service'
          description: 'The Hadoop DataNode is not sending heartbeats.'

      # Free fraction of HDFS capacity, (total - used) / total, has been
      # below 10% for 15 minutes.
      - alert: HadoopHDFSDiskSpaceLow
        expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Low HDFS Disk Space'
          description: 'Available HDFS disk space is running low.'

      # More than 100 MapReduce task failures per trailing 10-minute window,
      # sustained for 10 minutes.
      # The rule looks at the counter's recent increase: comparing the raw
      # cumulative value to 100 would keep the alert firing indefinitely once
      # the lifetime total passed 100.
      # NOTE(review): assumes the `_total` suffix really denotes a
      # monotonically increasing counter - confirm against the exporter config.
      - alert: HadoopMapReduceTaskFailures
        expr: 'increase(hadoop_mapreduce_task_failures_total[10m]) > 100'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Excessive MapReduce Task Failures'
          description: 'There is an unusually high number of MapReduce task failures.'

      # ResourceManager memory has been above 80% of its maximum for 15 minutes.
      - alert: HadoopResourceManagerMemoryHigh
        expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'High ResourceManager Memory Usage'
          description: 'The Hadoop ResourceManager is approaching its memory limit.'

      # More than 10 YARN container allocation failures per trailing
      # 10-minute window, sustained for 10 minutes.
      # Uses the counter's recent increase for the same reason as the
      # MapReduce rule: a raw cumulative total never drops back under 10.
      # NOTE(review): assumes the `_total` metric is a counter - confirm.
      - alert: HadoopYARNContainerAllocationFailures
        expr: 'increase(hadoop_yarn_container_allocation_failures_total[10m]) > 10'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High YARN Container Allocation Failures'
          description: 'There is a significant number of YARN container allocation failures.'

      # HBase region count has been above 5000 for 15 minutes.
      - alert: HadoopHBaseRegionCountHigh
        expr: 'hadoop_hbase_region_count > 5000'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Excessive HBase Region Count'
          description: 'The HBase cluster has an unusually high number of regions.'

      # Region-server heap / max heap has been below 0.2 for 10 minutes.
      # NOTE(review): if `..._heap_bytes` reports *used* heap (as the parallel
      # ResourceManager memory rule suggests), this fires when usage is low,
      # the opposite of the description, and would need `> 0.8` instead.
      # Verify the metric's meaning before changing the comparison.
      - alert: HadoopHBaseRegionServerHeapLow
        expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Low HBase Region Server Heap Space'
          description: 'HBase Region Servers are running low on heap space.'

      # HBase write-request latency metric has been above 0.5 for 10 minutes.
      - alert: HadoopHBaseWriteRequestsLatencyHigh
        expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'High HBase Write Requests Latency'
          description: 'HBase Write Requests are experiencing high latency.'

95
dist/rules/hadoop/jmx_exporter.yml vendored Normal file
View file

@@ -0,0 +1,95 @@
---
# Prometheus alerting rules for Hadoop components: NameNode, ResourceManager,
# DataNode, HDFS capacity, MapReduce, YARN and HBase.
# Every rule must hold for its `for` duration before the alert fires; each
# annotation embeds the firing instance, the sample value and the label set.
groups:
  - name: Jmx_exporter
    rules:
      # `up` for the hadoop-namenode scrape job has been 0 for 5 minutes.
      - alert: HadoopNameNodeDown
        expr: 'up{job="hadoop-namenode"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop Name Node Down (instance {{ $labels.instance }})'
          description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # `up` for the hadoop-resourcemanager scrape job has been 0 for 5 minutes.
      - alert: HadoopResourceManagerDown
        expr: 'up{job="hadoop-resourcemanager"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop Resource Manager Down (instance {{ $labels.instance }})'
          description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The DataNode last-heartbeat metric has been exactly 0 for 10 minutes.
      - alert: HadoopDataNodeOutOfService
        expr: 'hadoop_datanode_last_heartbeat == 0'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop Data Node Out Of Service (instance {{ $labels.instance }})'
          description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Free fraction of HDFS capacity, (total - used) / total, has been
      # below 10% for 15 minutes.
      - alert: HadoopHdfsDiskSpaceLow
        expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})'
          description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 100 MapReduce task failures per trailing 10-minute window,
      # sustained for 10 minutes.
      # The rule looks at the counter's recent increase: comparing the raw
      # cumulative value to 100 would keep the alert firing indefinitely once
      # the lifetime total passed 100.
      # NOTE(review): assumes the `_total` suffix really denotes a
      # monotonically increasing counter - confirm against the exporter config.
      - alert: HadoopMapReduceTaskFailures
        expr: 'increase(hadoop_mapreduce_task_failures_total[10m]) > 100'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})'
          description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # ResourceManager memory has been above 80% of its maximum for 15 minutes.
      - alert: HadoopResourceManagerMemoryHigh
        expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop Resource Manager Memory High (instance {{ $labels.instance }})'
          description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # More than 10 YARN container allocation failures per trailing
      # 10-minute window, sustained for 10 minutes.
      # Uses the counter's recent increase for the same reason as the
      # MapReduce rule: a raw cumulative total never drops back under 10.
      # NOTE(review): assumes the `_total` metric is a counter - confirm.
      - alert: HadoopYarnContainerAllocationFailures
        expr: 'increase(hadoop_yarn_container_allocation_failures_total[10m]) > 10'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})'
          description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # HBase region count has been above 5000 for 15 minutes.
      - alert: HadoopHbaseRegionCountHigh
        expr: 'hadoop_hbase_region_count > 5000'
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop HBase Region Count High (instance {{ $labels.instance }})'
          description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Region-server heap / max heap has been below 0.2 for 10 minutes.
      # NOTE(review): if `..._heap_bytes` reports *used* heap (as the parallel
      # ResourceManager memory rule suggests), this fires when usage is low,
      # the opposite of the description, and would need `> 0.8` instead.
      # Verify the metric's meaning before changing the comparison.
      - alert: HadoopHbaseRegionServerHeapLow
        expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: 'Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})'
          description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # HBase write-request latency metric has been above 0.5 for 10 minutes.
      - alert: HadoopHbaseWriteRequestsLatencyHigh
        expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})'
          description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"