From 7a8f883df6633cdbccd9c9a54c618e0aff496658 Mon Sep 17 00:00:00 2001
From: Vicky Wilson Jacob
Date: Fri, 6 Oct 2023 12:48:54 -0400
Subject: [PATCH] feat: adding hadoop jmx exporter (#391)

* adding hadoop exporter

* added hadoop rules with jmx exporter

* added hadoop rules with jmx exporter

* Update rules.yml

---------

Co-authored-by: Samuel Berthe
---
 _data/rules.yml                           |  77 ++++++++++++++++
 dist/rules/hadoop/hadoop-jmx-exporter.yml | 102 ++++++++++++++++++++++
 2 files changed, 179 insertions(+)
 create mode 100644 dist/rules/hadoop/hadoop-jmx-exporter.yml

diff --git a/_data/rules.yml b/_data/rules.yml
index d818035..12d8e34 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -1405,6 +1405,83 @@ groups:
                 query: 'solr_collections_live_nodes < 2'
                 severity: critical
 
+      - name: Hadoop
+        exporters:
+          - name: hadoop/jmx_exporter
+            slug: jmx_exporter
+            doc_url: https://github.com/prometheus/jmx_exporter
+            rules:
+              # Scrape target of job "hadoop-namenode" reports up == 0 (held for 5m)
+              - name: Hadoop Name Node Down
+                query: 'up{job="hadoop-namenode"} == 0'
+                for: 5m
+                severity: critical
+                description: 'The Hadoop NameNode service is unavailable.'
+
+              # Scrape target of job "hadoop-resourcemanager" reports up == 0 (held for 5m)
+              - name: Hadoop Resource Manager Down
+                query: 'up{job="hadoop-resourcemanager"} == 0'
+                for: 5m
+                severity: critical
+                description: 'The Hadoop ResourceManager service is unavailable.'
+
+              # DataNode heartbeat metric equals 0 for 10m (presumably 0 means "no heartbeat"; verify against the exporter mapping)
+              - name: Hadoop Data Node Out Of Service
+                query: 'hadoop_datanode_last_heartbeat == 0'
+                for: 10m
+                severity: warning
+                description: 'The Hadoop DataNode is not sending heartbeats.'
+
+              # Free HDFS capacity, (total - used) / total, below 10% for 15m
+              - name: Hadoop HDFS Disk Space Low
+                query: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
+                for: 15m
+                severity: warning
+                description: 'Available HDFS disk space is running low.'
+
+              # MapReduce task failures: counter, so alert on failures added in the last 10m, not on the lifetime total
+              - name: Hadoop Map Reduce Task Failures
+                query: 'increase(hadoop_mapreduce_task_failures_total[10m]) > 100'
+                for: 10m
+                severity: critical
+                description: 'There is an unusually high number of MapReduce task failures.'
+
+              # ResourceManager memory above 80% of its maximum for 15m
+              - name: Hadoop Resource Manager Memory High
+                query: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
+                for: 15m
+                severity: warning
+                description: 'The Hadoop ResourceManager is approaching its memory limit.'
+
+              # YARN container allocation failures: counter, so alert on failures added in the last 10m, not on the lifetime total
+              - name: Hadoop YARN Container Allocation Failures
+                query: 'increase(hadoop_yarn_container_allocation_failures_total[10m]) > 10'
+                for: 10m
+                severity: warning
+                description: 'There is a significant number of YARN container allocation failures.'
+
+              # HBase region count above 5000 for 15m
+              - name: Hadoop HBase Region Count High
+                query: 'hadoop_hbase_region_count > 5000'
+                for: 15m
+                severity: warning
+                description: 'The HBase cluster has an unusually high number of regions.'
+
+              # HBase heap ratio below 20% for 10m. NOTE(review): if heap_bytes is *used* heap this should be "> 0.8" - confirm metric semantics
+              - name: Hadoop HBase Region Server Heap Low
+                query: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
+                for: 10m
+                severity: critical
+                description: 'HBase Region Servers are running low on heap space.'
+
+              # HBase write request latency above 0.5 seconds for 10m
+              - name: Hadoop HBase Write Requests Latency High
+                query: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
+                for: 10m
+                severity: warning
+                description: 'HBase Write Requests are experiencing high latency.'
+
+
   - name: Reverse proxies and load balancers
     services:
       - name: Nginx
diff --git a/dist/rules/hadoop/hadoop-jmx-exporter.yml b/dist/rules/hadoop/hadoop-jmx-exporter.yml
new file mode 100644
index 0000000..42e3478
--- /dev/null
+++ b/dist/rules/hadoop/hadoop-jmx-exporter.yml
@@ -0,0 +1,102 @@
+groups:
+  - name: HadoopAlerts
+    rules:
+      # Scrape target of job "hadoop-namenode" reports up == 0 (held for 5m)
+      - alert: HadoopNameNodeDown
+        expr: up{job="hadoop-namenode"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hadoop NameNode is down"
+          description: "The Hadoop NameNode service is unavailable."
+
+      # Scrape target of job "hadoop-resourcemanager" reports up == 0 (held for 5m)
+      - alert: HadoopResourceManagerDown
+        expr: up{job="hadoop-resourcemanager"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Hadoop ResourceManager is down"
+          description: "The Hadoop ResourceManager service is unavailable."
+
+      # DataNode heartbeat metric equals 0 for 10m (presumably 0 means "no heartbeat"; verify against the exporter mapping)
+      - alert: HadoopDataNodeOutOfService
+        expr: hadoop_datanode_last_heartbeat == 0
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Hadoop DataNode is out of service"
+          description: "The Hadoop DataNode is not sending heartbeats."
+
+      # Free HDFS capacity, (total - used) / total, below 10% for 15m
+      - alert: HadoopHDFSDiskSpaceLow
+        expr: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Low HDFS Disk Space"
+          description: "Available HDFS disk space is running low."
+
+      # MapReduce task failures: counter, so alert on failures added in the last 10m, not on the lifetime total
+      - alert: HadoopMapReduceTaskFailures
+        expr: increase(hadoop_mapreduce_task_failures_total[10m]) > 100
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Excessive MapReduce Task Failures"
+          description: "There is an unusually high number of MapReduce task failures."
+
+      # ResourceManager memory above 80% of its maximum for 15m
+      - alert: HadoopResourceManagerMemoryHigh
+        expr: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High ResourceManager Memory Usage"
+          description: "The Hadoop ResourceManager is approaching its memory limit."
+
+      # YARN container allocation failures: counter, so alert on failures added in the last 10m, not on the lifetime total
+      - alert: HadoopYARNContainerAllocationFailures
+        expr: increase(hadoop_yarn_container_allocation_failures_total[10m]) > 10
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High YARN Container Allocation Failures"
+          description: "There is a significant number of YARN container allocation failures."
+
+      # HBase region count above 5000 for 15m
+      - alert: HadoopHBaseRegionCountHigh
+        expr: hadoop_hbase_region_count > 5000
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Excessive HBase Region Count"
+          description: "The HBase cluster has an unusually high number of regions."
+
+      # HBase heap ratio below 20% for 10m. NOTE(review): if heap_bytes is *used* heap this should be "> 0.8" - confirm metric semantics
+      - alert: HadoopHBaseRegionServerHeapLow
+        expr: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
+        for: 10m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Low HBase Region Server Heap Space"
+          description: "HBase Region Servers are running low on heap space."
+
+      # HBase write request latency above 0.5 seconds for 10m
+      - alert: HadoopHBaseWriteRequestsLatencyHigh
+        expr: hadoop_hbase_write_requests_latency_seconds > 0.5
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High HBase Write Requests Latency"
+          description: "HBase Write Requests are experiencing high latency."