mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 11:27:00 +08:00
Merge branch 'master' into KubernetesJobSlowCompletion-exclude-failed
This commit is contained in:
commit
da71664d22
9 changed files with 98 additions and 47 deletions
18
.github/workflows/test.yml
vendored
18
.github/workflows/test.yml
vendored
|
|
@ -25,13 +25,27 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
gem install liquid-cli
|
gem install liquid-cli
|
||||||
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
|
||||||
cat test/template.yml | liquid "$(< _data/rules.json)" > test/rules.yml
|
|
||||||
|
for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
|
||||||
|
subdir=test/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
|
||||||
|
mkdir -p "${subdir}"
|
||||||
|
|
||||||
|
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
|
||||||
|
|
||||||
|
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
|
||||||
|
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
|
||||||
|
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
|
||||||
|
echo ${subdir}/${exporterName}.yml
|
||||||
|
done
|
||||||
|
done
|
||||||
|
|
||||||
|
rm _data/rules.json
|
||||||
|
|
||||||
- name: Check Prometheus alert rules
|
- name: Check Prometheus alert rules
|
||||||
uses: peimanja/promtool-github-actions@master
|
uses: peimanja/promtool-github-actions@master
|
||||||
with:
|
with:
|
||||||
promtool_actions_subcommand: 'rules'
|
promtool_actions_subcommand: 'rules'
|
||||||
promtool_actions_files: 'test/rules.yml'
|
promtool_actions_files: 'test/rules/*/*.yml'
|
||||||
promtool_actions_comment: true
|
promtool_actions_comment: true
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|
|
||||||
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -3,4 +3,4 @@ _site/
|
||||||
.jekyll-cache/
|
.jekyll-cache/
|
||||||
.jekyll-metadata
|
.jekyll-metadata
|
||||||
_data/rules.json
|
_data/rules.json
|
||||||
test/rules.yml
|
test/rules/
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ GEM
|
||||||
execjs
|
execjs
|
||||||
coffee-script-source (1.11.1)
|
coffee-script-source (1.11.1)
|
||||||
colorator (1.1.0)
|
colorator (1.1.0)
|
||||||
commonmarker (0.23.9)
|
commonmarker (0.23.10)
|
||||||
concurrent-ruby (1.2.0)
|
concurrent-ruby (1.2.0)
|
||||||
dnsruby (1.61.9)
|
dnsruby (1.61.9)
|
||||||
simpleidn (~> 0.1)
|
simpleidn (~> 0.1)
|
||||||
|
|
|
||||||
|
|
@ -125,8 +125,8 @@ groups:
|
||||||
description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
|
description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
|
||||||
query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
|
query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
|
||||||
severity: critical
|
severity: critical
|
||||||
- name: Prometheus timeserie cardinality
|
- name: Prometheus timeseries cardinality
|
||||||
description: 'The "{{ $labels.name }}" timeserie cardinality is getting very high: {{ $value }}'
|
description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
|
||||||
query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
|
query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
||||||
|
|
@ -196,6 +196,10 @@ groups:
|
||||||
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 2m
|
for: 2m
|
||||||
|
- name: Host filesystem device error
|
||||||
|
# Quoted on purpose: a plain YAML scalar cannot start with "{" (flow mapping) or contain ": ".
description: '{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem'
|
||||||
|
query: 'node_filesystem_device_error == 1'
|
||||||
|
severity: critical
|
||||||
- name: Host inodes will fill in 24 hours
|
- name: Host inodes will fill in 24 hours
|
||||||
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
|
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
|
||||||
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
|
|
@ -317,7 +321,7 @@ groups:
|
||||||
description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
|
description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
|
||||||
query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 2m
|
for: 10m
|
||||||
- name: Host clock not synchronising
|
- name: Host clock not synchronising
|
||||||
description: 'Clock not synchronising. Ensure NTP is configured on this host.'
|
description: 'Clock not synchronising. Ensure NTP is configured on this host.'
|
||||||
query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
|
|
@ -1339,6 +1343,33 @@ groups:
|
||||||
for: 1m
|
for: 1m
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
||||||
|
- name: Nats
|
||||||
|
exporters:
|
||||||
|
- name: nats-io/prometheus-nats-exporter
|
||||||
|
slug: nats-exporter
|
||||||
|
doc_url: https://github.com/nats-io/prometheus-nats-exporter
|
||||||
|
rules:
|
||||||
|
- name: Nats high connection count
|
||||||
|
description: High number of NATS connections ({{ $value }}) for {{ $labels.instance }}
|
||||||
|
query: 'gnatsd_varz_connections > 100'
|
||||||
|
severity: warning
|
||||||
|
for: 3m
|
||||||
|
- name: Nats high pending bytes
|
||||||
|
description: High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}
|
||||||
|
query: 'gnatsd_connz_pending_bytes > 100000'
|
||||||
|
severity: warning
|
||||||
|
for: 3m
|
||||||
|
- name: Nats high subscriptions count
|
||||||
|
description: High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}
|
||||||
|
query: 'gnatsd_connz_subscriptions > 50'
|
||||||
|
severity: warning
|
||||||
|
for: 3m
|
||||||
|
- name: Nats high routes count
|
||||||
|
description: High number of NATS routes ({{ $value }}) for {{ $labels.instance }}
|
||||||
|
query: 'gnatsd_routez_num_routes > 10'
|
||||||
|
severity: warning
|
||||||
|
for: 3m
|
||||||
|
|
||||||
- name: Solr
|
- name: Solr
|
||||||
exporters:
|
exporters:
|
||||||
- name: embedded exporter
|
- name: embedded exporter
|
||||||
|
|
@ -1985,8 +2016,8 @@ groups:
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 1m
|
for: 1m
|
||||||
- name: Istio latency 99 percentile
|
- name: Istio latency 99 percentile
|
||||||
description: Istio 1% slowest requests are longer than 1s.
|
description: Istio 1% slowest requests are longer than 1000ms.
|
||||||
query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
|
query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
|
||||||
severity: warning
|
severity: warning
|
||||||
for: 1m
|
for: 1m
|
||||||
- name: Istio Pilot Duplicate Entry
|
- name: Istio Pilot Duplicate Entry
|
||||||
|
|
|
||||||
|
|
@ -312,7 +312,7 @@ groups:
|
||||||
|
|
||||||
- alert: HostClockSkew
|
- alert: HostClockSkew
|
||||||
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
|
||||||
for: 2m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
|
|
|
||||||
4
dist/rules/istio/embedded-exporter.yml
vendored
4
dist/rules/istio/embedded-exporter.yml
vendored
|
|
@ -77,13 +77,13 @@ groups:
|
||||||
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: IstioLatency99Percentile
|
- alert: IstioLatency99Percentile
|
||||||
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
|
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
|
||||||
description: "Istio 1% slowest requests are longer than 1s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
- alert: IstioPilotDuplicateEntry
|
- alert: IstioPilotDuplicateEntry
|
||||||
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
|
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
|
||||||
|
|
|
||||||
41
dist/rules/nats/nats-exporter.yml
vendored
Normal file
41
dist/rules/nats/nats-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
groups:
|
||||||
|
|
||||||
|
- name: NatsExporter
|
||||||
|
|
||||||
|
rules:
|
||||||
|
|
||||||
|
- alert: NatsHighConnectionCount
|
||||||
|
expr: 'gnatsd_varz_connections > 100'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high connection count (instance {{ $labels.instance }})
|
||||||
|
description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighPendingBytes
|
||||||
|
expr: 'gnatsd_connz_pending_bytes > 100000'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high pending bytes (instance {{ $labels.instance }})
|
||||||
|
description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighSubscriptionsCount
|
||||||
|
expr: 'gnatsd_connz_subscriptions > 50'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high subscriptions count (instance {{ $labels.instance }})
|
||||||
|
description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: NatsHighRoutesCount
|
||||||
|
expr: 'gnatsd_routez_num_routes > 10'
|
||||||
|
for: 3m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Nats high routes count (instance {{ $labels.instance }})
|
||||||
|
description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
@ -1,17 +0,0 @@
|
||||||
|
|
||||||
Since some rules can be duplicated (multiple exporters), I added a prefix to rule name.
|
|
||||||
|
|
||||||
Error:
|
|
||||||
|
|
||||||
```
|
|
||||||
$ promtool check rules test/rules.yml
|
|
||||||
Checking rules.yml
|
|
||||||
29 duplicate rule(s) found.
|
|
||||||
Metric: CassandraClientRequestReadFailure
|
|
||||||
Label(s):
|
|
||||||
severity: critical
|
|
||||||
|
|
||||||
[...]
|
|
||||||
|
|
||||||
Might cause inconsistency while recording expressions.
|
|
||||||
```
|
|
||||||
|
|
@ -1,18 +0,0 @@
|
||||||
groups:
|
|
||||||
|
|
||||||
- name: AllRules
|
|
||||||
rules:
|
|
||||||
{% for group in groups %}{% assign groupIndex = forloop.index %}{% for service in group.services %}{% assign serviceIndex = forloop.index %}{% for exporter in service.exporters %}{% assign exporterIndex = forloop.index %}{% for rule in exporter.rules %}
|
|
||||||
{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
|
|
||||||
|
|
||||||
{% for comment in comments %}# {{ comment | strip }}
|
|
||||||
{% endfor %}- alert: {{ groupIndex }}_{{ serviceIndex }}_{{ exporterIndex }}_{{ ruleNameCamelcase | remove: ' ' }}
|
|
||||||
expr: '{{ rule.query }}'
|
|
||||||
for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
|
|
||||||
labels:
|
|
||||||
severity: {{ rule.severity }}
|
|
||||||
annotations:
|
|
||||||
summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
|
|
||||||
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
|
|
||||||
|
|
||||||
{% endfor %}{% endfor %}{% endfor %}{% endfor %}
|
|
||||||
Loading…
Reference in a new issue