feat(data): adding k8s rule - detect container killed by oomkiller

This commit is contained in:
Samuel Berthe 2021-05-01 19:33:03 +02:00
parent 18672ff0f9
commit 1c44cd7818
No known key found for this signature in database
GPG key ID: 64863511FFBD0E3C

View file

@@ -1501,6 +1501,10 @@ groups:
query: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(pod, namespace) group_left(node) (0 * kube_pod_info)) / sum(kube_node_status_allocatable_pods) by (node) * 100 > 90'
severity: warning
for: 2m
- name: Kubernetes container oom killer
  # Alert text. NOTE(review): with PromQL `and`, the result keeps the
  # left-hand sample, so {{ $value }} is presumably the restart-count
  # difference over 10 minutes rather than a count of OOM kills - confirm.
  description: 'Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.'
  # The query has two conditions joined by `and`:
  #   1. the restart counter is at least 1 higher than it was 10 minutes
  #      earlier (`offset 10m`);
  #   2. the minimum of the last-terminated-reason series with
  #      reason="OOMKilled" over the 10m window equals 1.
  # `ignoring (reason)` leaves the `reason` label out of the matching, since
  # it appears only in the right-hand selector.
  # The folded scalar (`>-`) joins the lines below with single spaces and
  # drops the trailing newline, yielding a one-line expression.
  query: >-
    (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
    and ignoring (reason)
    min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
  severity: warning
- name: Kubernetes Job failed
description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete"
query: 'kube_job_status_failed > 0'