Publish

2026-06-21 17:07:24 +08:00 · 2026-03-16 03:46:30 +00:00 · 2026-03-16 03:46:30 +00:00 · c390641203
commit c390641203
parent e6cdcdb9e5
2 changed files with 207 additions and 0 deletions
--- a/dist/rules/apache-flink/flink-prometheus-reporter.yml
+++ b/dist/rules/apache-flink/flink-prometheus-reporter.yml
@ -0,0 +1,119 @@
+groups:
+
+- name: FlinkPrometheusReporter
+
+  # Alerting rules built on flink_jobmanager_* and flink_taskmanager_* metrics.
+  # NOTE(review): presumably these series come from Flink's Prometheus metrics reporter (per the file name) - confirm.
+  
+  rules:
+
+    # Fires when flink_jobmanager_numRunningJobs has stayed at 0 for 1 minute.
+    - alert: FlinkJobIsNotRunning
+      expr: "flink_jobmanager_numRunningJobs == 0"
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Flink job is not running (instance {{ $labels.instance }})"
+        description: |-
+          No Flink jobs are currently running. All jobs may have failed or been cancelled.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when flink_jobmanager_numRegisteredTaskManagers has stayed at 0 for 1 minute.
+    - alert: FlinkNoTaskmanagersRegistered
+      expr: "flink_jobmanager_numRegisteredTaskManagers == 0"
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Flink no TaskManagers registered (instance {{ $labels.instance }})"
+        description: |-
+          No TaskManagers are registered with the JobManager. The cluster has no processing capacity.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires after 5 minutes with zero available task slots.
+    # Adjust the threshold if your cluster is expected to run at full capacity.
+    - alert: FlinkAllTaskSlotsUsed
+      expr: "flink_jobmanager_taskSlotsAvailable == 0"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink all task slots used (instance {{ $labels.instance }})"
+        description: |-
+          All Flink task slots are in use ({{ $value }} available). New jobs cannot be scheduled.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires immediately (for: 0m) when a job's restart count has increased within the last 5 minutes.
+    - alert: FlinkJobRestartIncreasing
+      expr: "increase(flink_jobmanager_job_numRestarts[5m]) > 0"
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink job restart increasing (instance {{ $labels.instance }})"
+        description: |-
+          Flink job {{ $labels.job_name }} has restarted {{ $value }} times in the last 5 minutes.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires immediately (for: 0m) when the failed-checkpoint count has increased within the last 10 minutes.
+    - alert: FlinkCheckpointFailures
+      expr: "increase(flink_jobmanager_job_numberOfFailedCheckpoints[10m]) > 0"
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink checkpoint failures (instance {{ $labels.instance }})"
+        description: |-
+          Flink job {{ $labels.job_name }} has {{ $value }} failed checkpoints in the last 10 minutes.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Threshold is 60 seconds. Adjust based on your checkpoint interval and state size.
+    # The metric is reported in milliseconds; it is divided by 1000 so that $value is in seconds,
+    # which is the unit the humanizeDuration template function expects.
+    - alert: FlinkCheckpointDurationHigh
+      expr: 'flink_jobmanager_job_lastCheckpointDuration / 1000 > 60'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Flink checkpoint duration high (instance {{ $labels.instance }})
+        description: "Flink job {{ $labels.job_name }} last checkpoint took {{ $value | humanizeDuration }} to complete.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when a task has reported isBackPressured == 1 for 5 minutes.
+    - alert: FlinkTaskBackpressured
+      expr: "flink_taskmanager_job_task_isBackPressured == 1"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink task backpressured (instance {{ $labels.instance }})"
+        description: |-
+          Flink task {{ $labels.task_name }} in job {{ $labels.job_name }} is backpressured.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when a task is backpressured for more than 500ms out of every second, sustained for 5 minutes.
+    # This indicates the task cannot keep up with the upstream data rate.
+    - alert: FlinkTaskHighBackpressureTime
+      expr: "flink_taskmanager_job_task_backPressuredTimeMsPerSecond > 500"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink task high backpressure time (instance {{ $labels.instance }})"
+        description: |-
+          Flink task {{ $labels.task_name }} is spending {{ $value | humanize }}ms/sec in backpressure.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when the TaskManager's used heap has exceeded 90% of its maximum heap for 5 minutes.
+    - alert: FlinkTaskmanagerHeapMemoryHigh
+      expr: "flink_taskmanager_Status_JVM_Memory_Heap_Used / flink_taskmanager_Status_JVM_Memory_Heap_Max > 0.9"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink TaskManager heap memory high (instance {{ $labels.instance }})"
+        description: |-
+          Flink TaskManager {{ $labels.instance }} heap memory usage is above 90%.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when the JobManager's used heap has exceeded 90% of its maximum heap for 5 minutes.
+    - alert: FlinkJobmanagerHeapMemoryHigh
+      expr: "flink_jobmanager_Status_JVM_Memory_Heap_Used / flink_jobmanager_Status_JVM_Memory_Heap_Max > 0.9"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink JobManager heap memory high (instance {{ $labels.instance }})"
+        description: |-
+          Flink JobManager {{ $labels.instance }} heap memory usage is above 90%.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Threshold: GC time growing by more than 100ms per second (10% of wall clock), averaged over 5 minutes.
+    # Adjust based on your workload.
+    - alert: FlinkTaskmanagerGcTimeHigh
+      expr: "rate(flink_taskmanager_Status_JVM_GarbageCollector_All_Time[5m]) > 100"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink TaskManager GC time high (instance {{ $labels.instance }})"
+        description: |-
+          Flink TaskManager {{ $labels.instance }} is spending more than 10% of time in garbage collection.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # The "and ... > 0" clause restricts the alert to tasks that have previously received records,
+    # which avoids false positives during startup.
+    - alert: FlinkNoRecordsProcessed
+      expr: "rate(flink_taskmanager_job_task_numRecordsIn[5m]) == 0 and flink_taskmanager_job_task_numRecordsIn > 0"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Flink no records processed (instance {{ $labels.instance }})"
+        description: |-
+          Flink task {{ $labels.task_name }} has not processed any records in the last 5 minutes.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
--- a/dist/rules/apache-spark/spark-prometheus.yml
+++ b/dist/rules/apache-spark/spark-prometheus.yml
@ -0,0 +1,88 @@
+groups:
+
+- name: SparkPrometheus
+
+  # Spark exposes metrics via two built-in endpoints:
+  # - PrometheusServlet: master/worker/driver metrics at /metrics/prometheus/ (ports 8080, 8081, 4040)
+  # - PrometheusResource: executor metrics at /metrics/executors/prometheus/ (port 4040, requires spark.ui.prometheus.enabled=true in Spark 3.x)
+  # Metric names from PrometheusServlet include a dynamic namespace (application ID), making static PromQL queries challenging.
+  # Configuration: spark.metrics.conf.*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet
+  # The rules below on metrics_master_* and metrics_worker_* series rely on PrometheusServlet;
+  # the rules on metrics_executor_* series rely on PrometheusResource.
+  
+  rules:
+
+    # Fires when metrics_master_aliveWorkers_Value has stayed at 0 for 1 minute.
+    - alert: SparkNoAliveWorkers
+      expr: "metrics_master_aliveWorkers_Value == 0"
+      for: 1m
+      labels:
+        severity: critical
+      annotations:
+        summary: "Spark no alive workers (instance {{ $labels.instance }})"
+        description: |-
+          No Spark workers are alive. The cluster has no processing capacity.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when more than 10 applications have been waiting for 5 minutes.
+    # Adjust the threshold based on your cluster's typical queuing behavior.
+    - alert: SparkTooManyWaitingApps
+      expr: "metrics_master_waitingApps_Value > 10"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Spark too many waiting apps (instance {{ $labels.instance }})"
+        description: |-
+          Spark has {{ $value }} applications waiting for resources.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when a worker has reported 0 MB of free memory for 2 minutes.
+    - alert: SparkWorkerMemoryExhausted
+      expr: "metrics_worker_memFree_MB_Value == 0"
+      for: 2m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Spark worker memory exhausted (instance {{ $labels.instance }})"
+        description: |-
+          Spark worker {{ $labels.instance }} has no free memory ({{ $value }}MB free).
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when a worker has reported zero free cores for 5 minutes.
+    # This may be normal under high load but can indicate capacity issues.
+    - alert: SparkWorkerCoresExhausted
+      expr: "metrics_worker_coresFree_Value == 0"
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "Spark worker cores exhausted (instance {{ $labels.instance }})"
+        description: |-
+          Spark worker {{ $labels.instance }} has no free cores.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    # Fires when more than 10% of executor time is spent in garbage collection.
+    # These metrics come from the PrometheusResource endpoint (/metrics/executors/prometheus/),
+    # which publishes the cumulative timers with a _seconds_total suffix.
+    # The "> 0" filter on the denominator drops executors that have not accumulated any task time yet.
+    - alert: SparkExecutorHighGcTime
+      expr: 'metrics_executor_totalGCTime_seconds_total / (metrics_executor_totalDuration_seconds_total > 0) > 0.1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Spark executor high GC time (instance {{ $labels.instance }})
+        description: "Spark executor {{ $labels.executor_id }} in {{ $labels.application_name }} is spending too much time in GC.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when an executor has recorded failed tasks but no completed task for 5 minutes.
+    # The PrometheusResource endpoint publishes these task counters with a _total suffix.
+    - alert: SparkExecutorAllTasksFailing
+      expr: 'metrics_executor_failedTasks_total > 0 and metrics_executor_completedTasks_total == 0'
+      for: 5m
+      labels:
+        severity: critical
+      annotations:
+        summary: Spark executor all tasks failing (instance {{ $labels.instance }})
+        description: "Spark executor {{ $labels.executor_id }} has only failing tasks ({{ $value }} failed, 0 completed).\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Fires when more than 10% of an executor's tasks have failed, sustained for 5 minutes.
+    # The PrometheusResource endpoint publishes these task counters with a _total suffix.
+    # The "> 0" filter on the denominator drops executors that have not been assigned any task yet.
+    - alert: SparkExecutorHighTaskFailureRate
+      expr: 'metrics_executor_failedTasks_total / (metrics_executor_totalTasks_total > 0) > 0.1'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Spark executor high task failure rate (instance {{ $labels.instance }})
+        description: "Spark executor {{ $labels.executor_id }} has a task failure rate above 10%.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    # Disk spilling indicates insufficient memory for the workload.
+    # diskUsed_bytes is a gauge (it can go down), so growth is measured with delta() rather than rate():
+    # rate() is meant for counters and would treat every decrease as a counter reset.
+    # NOTE(review): confirm that diskUsed_bytes reflects spill and not only blocks persisted to disk.
+    - alert: SparkExecutorHighDiskSpill
+      expr: 'delta(metrics_executor_diskUsed_bytes[5m]) > 0'
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: Spark executor high disk spill (instance {{ $labels.instance }})
+        description: "Spark executor {{ $labels.executor_id }} is spilling data to disk. Consider increasing executor memory.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"