diff --git a/dist/rules/hadoop/hadoop-jmx-exporter.yml b/dist/rules/hadoop/hadoop-jmx-exporter.yml
deleted file mode 100644
index 42e3478..0000000
--- a/dist/rules/hadoop/hadoop-jmx-exporter.yml
+++ /dev/null
@@ -1,102 +0,0 @@
-groups:
-  - name: HadoopAlerts
-    rules:
-      # Alert rule for NameNode availability
-      - alert: HadoopNameNodeDown
-        expr: up{job="hadoop-namenode"} == 0
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Hadoop NameNode is down"
-          description: "The Hadoop NameNode service is unavailable."
-
-      # Alert rule for ResourceManager availability
-      - alert: HadoopResourceManagerDown
-        expr: up{job="hadoop-resourcemanager"} == 0
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Hadoop ResourceManager is down"
-          description: "The Hadoop ResourceManager service is unavailable."
-
-      # Alert rule for DataNode status
-      - alert: HadoopDataNodeOutOfService
-        expr: hadoop_datanode_last_heartbeat == 0
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Hadoop DataNode is out of service"
-          description: "The Hadoop DataNode is not sending heartbeats."
-
-      # Alert rule for low HDFS disk space
-      - alert: HadoopHDFSDiskSpaceLow
-        expr: (hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Low HDFS Disk Space"
-          description: "Available HDFS disk space is running low."
-
-      # Alert rule for excessive MapReduce task failures
-      - alert: HadoopMapReduceTaskFailures
-        expr: hadoop_mapreduce_task_failures_total > 100
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Excessive MapReduce Task Failures"
-          description: "There is an unusually high number of MapReduce task failures."
-
-      # Alert rule for high ResourceManager memory usage
-      - alert: HadoopResourceManagerMemoryHigh
-        expr: hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High ResourceManager Memory Usage"
-          description: "The Hadoop ResourceManager is approaching its memory limit."
-
-      # Alert rule for high YARN container allocation failures
-      - alert: HadoopYARNContainerAllocationFailures
-        expr: hadoop_yarn_container_allocation_failures_total > 10
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High YARN Container Allocation Failures"
-          description: "There is a significant number of YARN container allocation failures."
-
-      # Alert rule for excessive HBase region server region count
-      - alert: HadoopHBaseRegionCountHigh
-        expr: hadoop_hbase_region_count > 5000
-        for: 15m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Excessive HBase Region Count"
-          description: "The HBase cluster has an unusually high number of regions."
-
-      # Alert rule for low HBase region server heap space
-      - alert: HadoopHBaseRegionServerHeapLow
-        expr: hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2
-        for: 10m
-        labels:
-          severity: critical
-        annotations:
-          summary: "Low HBase Region Server Heap Space"
-          description: "HBase Region Servers are running low on heap space."
-
-      # Alert rule for high HBase Write Requests latency
-      - alert: HadoopHBaseWriteRequestsLatencyHigh
-        expr: hadoop_hbase_write_requests_latency_seconds > 0.5
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          summary: "High HBase Write Requests Latency"
-          description: "HBase Write Requests are experiencing high latency."
\ No newline at end of file
diff --git a/dist/rules/hadoop/jmx_exporter.yml b/dist/rules/hadoop/jmx_exporter.yml
new file mode 100644
index 0000000..42d6ee3
--- /dev/null
+++ b/dist/rules/hadoop/jmx_exporter.yml
@@ -0,0 +1,111 @@
+groups:
+
+- name: Jmx_exporter
+
+  rules:
+
+    # NameNode scrape target has reported up == 0 for 5 minutes.
+    - alert: HadoopNameNodeDown
+      expr: 'up{job="hadoop-namenode"} == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Name Node Down (instance {{ $labels.instance }})
+        description: "The Hadoop NameNode service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # ResourceManager scrape target has reported up == 0 for 5 minutes.
+    - alert: HadoopResourceManagerDown
+      expr: 'up{job="hadoop-resourcemanager"} == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Resource Manager Down (instance {{ $labels.instance }})
+        description: "The Hadoop ResourceManager service is unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # DataNode last-heartbeat metric has read 0 for 10 minutes.
+    - alert: HadoopDataNodeOutOfService
+      expr: 'hadoop_datanode_last_heartbeat == 0'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop Data Node Out Of Service (instance {{ $labels.instance }})
+        description: "The Hadoop DataNode is not sending heartbeats.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Free fraction of HDFS capacity, (total - used) / total, is below 10%.
+    - alert: HadoopHdfsDiskSpaceLow
+      expr: '(hadoop_hdfs_bytes_total - hadoop_hdfs_bytes_used) / hadoop_hdfs_bytes_total < 0.1'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HDFS Disk Space Low (instance {{ $labels.instance }})
+        description: "Available HDFS disk space is running low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # More than 100 MapReduce task failures within the last hour.
+    # The metric is a cumulative counter (_total), so it is wrapped in increase();
+    # a raw "> 100" comparison would keep firing once the lifetime count passed 100.
+    - alert: HadoopMapReduceTaskFailures
+      expr: 'increase(hadoop_mapreduce_task_failures_total[1h]) > 100'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop Map Reduce Task Failures (instance {{ $labels.instance }})
+        description: "There is an unusually high number of MapReduce task failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # ResourceManager memory is above 80% of hadoop_resourcemanager_memory_max_bytes.
+    - alert: HadoopResourceManagerMemoryHigh
+      expr: 'hadoop_resourcemanager_memory_bytes / hadoop_resourcemanager_memory_max_bytes > 0.8'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop Resource Manager Memory High (instance {{ $labels.instance }})
+        description: "The Hadoop ResourceManager is approaching its memory limit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # More than 10 YARN container allocation failures within the last hour.
+    # Counter metric: increase() measures recent failures rather than the lifetime
+    # total, which only ever grows and would leave the alert firing permanently.
+    - alert: HadoopYarnContainerAllocationFailures
+      expr: 'increase(hadoop_yarn_container_allocation_failures_total[1h]) > 10'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop YARN Container Allocation Failures (instance {{ $labels.instance }})
+        description: "There is a significant number of YARN container allocation failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Region count metric is above 5000.
+    - alert: HadoopHbaseRegionCountHigh
+      expr: 'hadoop_hbase_region_count > 5000'
+      for: 15m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HBase Region Count High (instance {{ $labels.instance }})
+        description: "The HBase cluster has an unusually high number of regions.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # Ratio of region server heap bytes to max heap bytes is below 20%.
+    # NOTE(review): if heap_bytes is *used* heap this fires when usage is low, not
+    # when free heap is low as the description says -- confirm the metric meaning.
+    - alert: HadoopHbaseRegionServerHeapLow
+      expr: 'hadoop_hbase_region_server_heap_bytes / hadoop_hbase_region_server_max_heap_bytes < 0.2'
+      for: 10m
+      labels:
+        severity: critical
+      annotations:
+        summary: Hadoop HBase Region Server Heap Low (instance {{ $labels.instance }})
+        description: "HBase Region Servers are running low on heap space.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # HBase write request latency metric is above 0.5 seconds.
+    - alert: HadoopHbaseWriteRequestsLatencyHigh
+      expr: 'hadoop_hbase_write_requests_latency_seconds > 0.5'
+      for: 10m
+      labels:
+        severity: warning
+      annotations:
+        summary: Hadoop HBase Write Requests Latency High (instance {{ $labels.instance }})
+        description: "HBase Write Requests are experiencing high latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"