Merge branch 'master' into KubernetesJobSlowCompletion-exclude-failed

This commit is contained in:
Pavel Timofeev 2023-08-15 10:18:50 -06:00
commit da71664d22
9 changed files with 98 additions and 47 deletions

View file

@ -25,13 +25,27 @@ jobs:
run: |
gem install liquid-cli
cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
cat test/template.yml | liquid "$(< _data/rules.json)" > test/rules.yml
for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
subdir=test/rules/$(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')
mkdir -p "${subdir}"
# groupName=$(echo "{% assign groupName = name | split: ' ' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {{ groupNameCamelcase | remove: ' ' | remove: '-' }}" | liquid $(echo ${service} | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")'))
for exporter in $(echo ${service} | base64 --decode | jq -r '.exporters[] | @base64'); do
exporterName=$(echo ${exporter} | base64 --decode | jq -r '.slug')
cat dist/template.yml | liquid "$(echo ${exporter} | base64 --decode)" > ${subdir}/${exporterName}.yml
echo ${subdir}/${exporterName}.yml
done
done
rm _data/rules.json
- name: Check Prometheus alert rules
uses: peimanja/promtool-github-actions@master
with:
promtool_actions_subcommand: 'rules'
promtool_actions_files: 'test/rules.yml'
promtool_actions_files: 'test/rules/*/*.yml'
promtool_actions_comment: true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

2
.gitignore vendored
View file

@ -3,4 +3,4 @@ _site/
.jekyll-cache/
.jekyll-metadata
_data/rules.json
test/rules.yml
test/rules/

View file

@ -14,7 +14,7 @@ GEM
execjs
coffee-script-source (1.11.1)
colorator (1.1.0)
commonmarker (0.23.9)
commonmarker (0.23.10)
concurrent-ruby (1.2.0)
dnsruby (1.61.9)
simpleidn (~> 0.1)

View file

@ -125,8 +125,8 @@ groups:
description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
severity: critical
- name: Prometheus timeserie cardinality
description: 'The "{{ $labels.name }}" timeserie cardinality is getting very high: {{ $value }}'
- name: Prometheus timeseries cardinality
description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
severity: warning
@ -196,6 +196,10 @@ groups:
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: warning
for: 2m
# Fires when node_filesystem_device_error equals 1 for a filesystem.
# The description is quoted because it begins with `{{` and contains `: `;
# left unquoted, YAML would try to read it as a flow mapping and fail to parse.
- name: Host filesystem device error
  description: '{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem'
  query: 'node_filesystem_device_error == 1'
  severity: critical
- name: Host inodes will fill in 24 hours
description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
@ -317,7 +321,7 @@ groups:
description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
severity: warning
for: 2m
for: 10m
- name: Host clock not synchronising
description: 'Clock not synchronising. Ensure NTP is configured on this host.'
query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
@ -1339,6 +1343,33 @@ groups:
for: 1m
severity: critical
# NATS service entry: one exporter with four threshold-based warning rules.
# Every rule must hold for 3m before it fires.
- name: Nats
  exporters:
    - name: nats-io/prometheus-nats-exporter
      slug: nats-exporter
      doc_url: https://github.com/nats-io/prometheus-nats-exporter
      rules:
        # Connection count above 100.
        - name: Nats high connection count
          description: 'High number of NATS connections ({{ $value }}) for {{ $labels.instance }}'
          query: 'gnatsd_varz_connections > 100'
          severity: warning
          for: 3m
        # Pending bytes above 100000.
        - name: Nats high pending bytes
          description: 'High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}'
          query: 'gnatsd_connz_pending_bytes > 100000'
          severity: warning
          for: 3m
        # Subscription count above 50.
        - name: Nats high subscriptions count
          description: 'High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}'
          query: 'gnatsd_connz_subscriptions > 50'
          severity: warning
          for: 3m
        # Route count above 10.
        - name: Nats high routes count
          description: 'High number of NATS routes ({{ $value }}) for {{ $labels.instance }}'
          query: 'gnatsd_routez_num_routes > 10'
          severity: warning
          for: 3m
- name: Solr
exporters:
- name: embedded exporter
@ -1985,8 +2016,8 @@ groups:
severity: warning
for: 1m
- name: Istio latency 99 percentile
description: Istio 1% slowest requests are longer than 1s.
query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
description: Istio 1% slowest requests are longer than 1000ms.
query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
severity: warning
for: 1m
- name: Istio Pilot Duplicate Entry

View file

@ -312,7 +312,7 @@ groups:
- alert: HostClockSkew
expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
for: 10m
labels:
severity: warning
annotations:

View file

@ -77,13 +77,13 @@ groups:
description: "Istio average requests execution is longer than 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioLatency99Percentile
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
for: 1m
labels:
severity: warning
annotations:
summary: Istio latency 99 percentile (instance {{ $labels.instance }})
description: "Istio 1% slowest requests are longer than 1s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Istio 1% slowest requests are longer than 1000ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: IstioPilotDuplicateEntry
expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'

41
dist/rules/nats/nats-exporter.yml vendored Normal file
View file

@ -0,0 +1,41 @@
# Prometheus alerting rules for the NATS exporter.
# One rule group, four warning-level alerts, each requiring the condition to
# hold for 3m before firing.
groups:
  - name: NatsExporter
    rules:
      - alert: NatsHighConnectionCount
        expr: 'gnatsd_varz_connections > 100'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high connection count (instance {{ $labels.instance }})"
          description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NatsHighPendingBytes
        expr: 'gnatsd_connz_pending_bytes > 100000'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high pending bytes (instance {{ $labels.instance }})"
          description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NatsHighSubscriptionsCount
        expr: 'gnatsd_connz_subscriptions > 50'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high subscriptions count (instance {{ $labels.instance }})"
          description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: NatsHighRoutesCount
        expr: 'gnatsd_routez_num_routes > 10'
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: "Nats high routes count (instance {{ $labels.instance }})"
          description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,17 +0,0 @@
Because the same rule can appear under more than one exporter, a prefix is added to each rule name to keep alert names unique.
Without the prefix, `promtool` reports:
```
$ promtool check rules test/rules.yml
Checking rules.yml
29 duplicate rule(s) found.
Metric: CassandraClientRequestReadFailure
Label(s):
severity: critical
[...]
Might cause inconsistency while recording expressions.
```

View file

@ -1,18 +0,0 @@
# Liquid template that renders every rule of every exporter, service and group
# into a single Prometheus rule group named AllRules.
# Alert names take the form <groupIndex>_<serviceIndex>_<exporterIndex>_<RuleName>,
# where RuleName is the rule name with each word passed through `capitalize`
# and the spaces removed.
# `for` falls back to 0m when a rule does not set it, and double quotes in the
# rule description are replaced with \" before it is placed inside the
# double-quoted description string.
# NOTE(review): the comment loop iterates a bare `comments` variable rather
# than a field of `rule`; confirm where that value is expected to come from.
groups:
- name: AllRules
rules:
{% for group in groups %}{% assign groupIndex = forloop.index %}{% for service in group.services %}{% assign serviceIndex = forloop.index %}{% for exporter in service.exporters %}{% assign exporterIndex = forloop.index %}{% for rule in exporter.rules %}
{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
{% for comment in comments %}# {{ comment | strip }}
{% endfor %}- alert: {{ groupIndex }}_{{ serviceIndex }}_{{ exporterIndex }}_{{ ruleNameCamelcase | remove: ' ' }}
expr: '{{ rule.query }}'
for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
labels:
severity: {{ rule.severity }}
annotations:
summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}"
{% endfor %}{% endfor %}{% endfor %}{% endfor %}