From 52ce326823c08ffbecef87b7044e19dc39c47b43 Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Mon, 11 Feb 2019 15:46:46 +0300 Subject: [PATCH 1/3] Elasticsearch alert rules --- _data/rules.yml | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index a655c8b..e5d4f8f 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -216,6 +216,46 @@ services: - name: justwatchcom/elasticsearch_exporter doc_url: https://github.com/justwatchcom/elasticsearch_exporter rules: + - name: Elastic Heap Usage Too High + description: 'The heap usage is over 90% for 5m - {{ $value | printf "%.2f"}}%' + query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90' + severity: critical + - name: Elastic Cluster RED + description: Elastic Cluster RED status + query: 'elasticsearch_cluster_health_status{color="red"} == 1' + severity: critical + - name: Elastic Cluster Yellow + description: Elastic Cluster Yellow status + query: 'elasticsearch_cluster_health_status{color="yellow"} == 1' + severity: warning + - name: Number of Elastic Healthy Nodes + description: 'Number Healthy Nodes less then number_of_nodes - {{ $value }}' + query: 'elasticsearch_cluster_health_number_of_nodes < number_of_nodes' + severity: critical + - name: Number of Elastic Healthy Data Nodes + description: 'Number Healthy Data Nodes less then number_of_data_nodes - {{ $value }}' + query: 'elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes' + severity: critical + - name: Number of relocation shards + description: 'Number of relocation shards for 20 min - {{ $value }}' + query: 'elasticsearch_cluster_health_relocating_shards > 0' + severity: critical + - name: Number of initializing shards + description: 'Number of initializing shards for 10 min - {{ $value }}' + query: 'elasticsearch_cluster_health_initializing_shards > 0' + severity: critical + - name: Number of unassigned shards + description: 'Number of unassigned shards for 2 min - {{ $value }}' + query: 'elasticsearch_cluster_health_unassigned_shards > 0' + severity: critical + - name: Number of pending tasks + description: 'Number of pending tasks for 10 min - {{ $value }}. Cluster works slowly.' + query: 'elasticsearch_cluster_health_number_of_pending_tasks > 0' + severity: warning + - name: Elastic no new documents + description: No new documents for 10 min! + query: 'rate(elasticsearch_indices_docs{es_master_node="false"}[10m]) < 1' + severity: warning - name: Apache exporters: From eab8b1a86d74555d10b6341509f86eb8834adf33 Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Mon, 11 Feb 2019 16:50:26 +0300 Subject: [PATCH 2/3] Elasticsearch Heap Usage warning (>80%) --- _data/rules.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/_data/rules.yml b/_data/rules.yml index e5d4f8f..c5aa279 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -220,6 +220,10 @@ services: description: 'The heap usage is over 90% for 5m - {{ $value | printf "%.2f"}}%' query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90' severity: critical + - name: Elastic Heap Usage warning + description: 'The heap usage is over 80% for 5m - {{ $value | printf "%.2f"}}%' + query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80' + severity: warning - name: Elastic Cluster RED description: Elastic Cluster RED status query: 'elasticsearch_cluster_health_status{color="red"} == 1' From 0999af4aa8193192daafa16457ae5ab2677278a9 Mon Sep 17 00:00:00 2001 From: Sofrony Pavel Date: Mon, 11 Feb 2019 16:58:15 +0300 Subject: [PATCH 3/3] consistent naming for severity --- _data/rules.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/_data/rules.yml b/_data/rules.yml index c5aa279..39d2140 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -219,7 +219,7 @@ services: - name: Elastic Heap Usage Too High description: 'The heap usage is over 90% for 5m - {{ $value | printf "%.2f"}}%' query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90' - severity: critical + severity: error - name: Elastic Heap Usage warning description: 'The heap usage is over 80% for 5m - {{ $value | printf "%.2f"}}%' query: '(elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80' @@ -227,7 +227,7 @@ services: - name: Elastic Cluster RED description: Elastic Cluster RED status query: 'elasticsearch_cluster_health_status{color="red"} == 1' - severity: critical + severity: error - name: Elastic Cluster Yellow description: Elastic Cluster Yellow status query: 'elasticsearch_cluster_health_status{color="yellow"} == 1' @@ -235,23 +235,23 @@ services: - name: Number of Elastic Healthy Nodes description: 'Number Healthy Nodes less then number_of_nodes - {{ $value }}' query: 'elasticsearch_cluster_health_number_of_nodes < number_of_nodes' - severity: critical + severity: error - name: Number of Elastic Healthy Data Nodes description: 'Number Healthy Data Nodes less then number_of_data_nodes - {{ $value }}' query: 'elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes' - severity: critical + severity: error - name: Number of relocation shards description: 'Number of relocation shards for 20 min - {{ $value }}' query: 'elasticsearch_cluster_health_relocating_shards > 0' - severity: critical + severity: error - name: Number of initializing shards description: 'Number of initializing shards for 10 min - {{ $value }}' query: 'elasticsearch_cluster_health_initializing_shards > 0' - severity: critical + severity: error - name: Number of unassigned shards description: 'Number of unassigned shards for 2 min - {{ $value }}' query: 'elasticsearch_cluster_health_unassigned_shards > 0' - severity: critical + severity: error - name: Number of pending tasks description: 'Number of pending tasks for 10 min - {{ $value }}. Cluster works slowly.' query: 'elasticsearch_cluster_health_number_of_pending_tasks > 0'