From 7dedf1269510adfe509985afe7d3c965f52113bd Mon Sep 17 00:00:00 2001
From: Jonathan Davies
Date: Tue, 25 Jun 2019 14:15:43 +0100
Subject: [PATCH] _data/rules.yml: Added missing jobs rules.

---
Review notes (placed after the "---" separator, so not part of the commit
message):

  * Purpose: adds five "<component>Down" alert rules (API server, controller
    manager, scheduler, kube-state-metrics, kubelet) to the Kubernetes
    service and touches the KubeClientErrors description.
  * KubeClientErrors description: the inner double quotes around %0.0f stay
    backslash-escaped. Unescaped, they terminate the double-quoted YAML
    scalar early and the file no longer parses. The unbalanced trailing
    apostrophe after "errors." is dropped.
  * KubeControllerManagerDown / KubeSchedulerDown: the label matcher is
    k8s_app, not k8s-app. A hyphen is not valid in an unquoted PromQL label
    name.
  * NOTE(review): leading whitespace of the YAML lines was reconstructed
    (2-space indent, rule items flush under "rules:"). Confirm against the
    target file, or apply with "git apply --ignore-whitespace".
  * NOTE(review): the post-image blob hash on the "index" line predates the
    edits above and is stale. Regenerate the patch if it matters.
  * NOTE(review): KubeletDown uses absent(up{...} == 1) while the other four
    new rules use absent(up{...}) == 1. The former also fires when every
    matching target reports up == 0. Confirm which form is intended.

 _data/rules.yml | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/_data/rules.yml b/_data/rules.yml
index 1378091..a2ed5b8 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -295,18 +295,38 @@ services:
   - name: Kubernetes
     exporters:
       - rules:
+        - name: KubeAPIServerDown
+          description: "Kubernetes API has disappeared from Prometheus target discovery."
+          query: 'absent(up{job="kubernetes-apiservers"}) == 1'
+          severity: critical
         - name: KubeClientErrors
-          description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }}% errors.'"
+          description: "Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf \"%0.0f\" $value }}% errors."
           query: '(sum(rate(rest_client_requests_total{job=~"kubernetes-.*",code=~"5.."}[5m])) by (instance, job) / sum(rate(rest_client_requests_total[5m])) by (instance, job)) * 100 > 1'
           severity: warning
+        - name: KubeControllerManagerDown
+          description: "Kubernetes controller manager has disappeared from Prometheus target discovery."
+          query: 'absent(up{k8s_app="kubernetes-controller-manager"}) == 1'
+          severity: critical
         - name: KubeNodeNotReady
           description: "{{ $labels.node }} has been unready for more than an hour."
           query: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
           severity: warning
+        - name: KubeSchedulerDown
+          description: "Kubernetes scheduler has disappeared from Prometheus target discovery."
+          query: 'absent(up{k8s_app="kubernetes-scheduler"}) == 1'
+          severity: critical
+        - name: KubeStateMetricsDown
+          description: "Kubernetes state metrics has disappeared from Prometheus target discovery."
+          query: 'absent(up{component="kube-state-metrics"}) == 1'
+          severity: critical
         - name: KubeVersionMismatch
           description: "There are {{ $value }} different semantic versions of Kubernetes components running."
           query: 'count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1'
           severity: warning
+        - name: KubeletDown
+          description: "Kubelet has disappeared from Prometheus target discovery."
+          query: 'absent(up{job="kubernetes-nodes"} == 1)'
+          severity: warning
         - name: KubeletTooManyPods
           description: "Kubelet {{ $labels.instance }} is running {{ $value }} pods, close to the limit of 110."
           query: 'kubelet_running_pod_count{job="kubernetes-nodes"} > (110 * 0.9)'