diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 62439ef..5d0cc3e 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,13 +25,27 @@ jobs:
         run: |
           gem install liquid-cli
           cat _data/rules.yml | yq -I 0 -o json > _data/rules.json
-          cat test/template.yml | liquid "$(< _data/rules.json)" > test/rules.yml
+
+          for service in $(cat _data/rules.json | jq -r '.groups[].services[] | @base64'); do
+            subdir="test/rules/$(echo "${service}" | base64 --decode | jq -r '.name | ascii_downcase | split(" ") | join("-")')"
+            mkdir -p "${subdir}"
+
+            # Render one file per exporter: alert names can repeat across exporters, and promtool reports duplicates when they share a single rules file.
+
+            for exporter in $(echo "${service}" | base64 --decode | jq -r '.exporters[] | @base64'); do
+              exporterName="$(echo "${exporter}" | base64 --decode | jq -r '.slug')"
+              cat dist/template.yml | liquid "$(echo "${exporter}" | base64 --decode)" > "${subdir}/${exporterName}.yml"
+              echo "${subdir}/${exporterName}.yml"
+            done
+          done
+
+          rm _data/rules.json
 
       - name: Check Prometheus alert rules
         uses: peimanja/promtool-github-actions@master
         with:
           promtool_actions_subcommand: 'rules'
-          promtool_actions_files: 'test/rules.yml'
+          promtool_actions_files: 'test/rules/*/*.yml'
           promtool_actions_comment: true
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 8353a06..12ca387 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,4 @@ _site/
 .jekyll-cache/
 .jekyll-metadata
 _data/rules.json
-test/rules.yml
+test/rules/
diff --git a/Gemfile.lock b/Gemfile.lock
index f0593ec..7b76ef9 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -14,7 +14,7 @@ GEM
       execjs
     coffee-script-source (1.11.1)
     colorator (1.1.0)
-    commonmarker (0.23.9)
+    commonmarker (0.23.10)
     concurrent-ruby (1.2.0)
     dnsruby (1.61.9)
       simpleidn (~> 0.1)
diff --git a/_data/rules.yml b/_data/rules.yml
index 2595167..6f8ea35 100644
--- a/_data/rules.yml
+++ b/_data/rules.yml
@@ -125,8 +125,8 @@ groups:
                 description: 'Prometheus encountered {{ $value }} TSDB WAL truncation failures'
                 query: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
                 severity: critical
-              - name: Prometheus timeserie cardinality
-                description: 'The "{{ $labels.name }}" timeserie cardinality is getting very high: {{ $value }}'
+              - name: Prometheus timeseries cardinality
+                description: 'The "{{ $labels.name }}" timeseries cardinality is getting very high: {{ $value }}'
                 query: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
                 severity: warning
 
@@ -196,6 +196,10 @@ groups:
                 query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
                 for: 2m
+              - name: Host filesystem device error
+                description: '{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem'
+                query: 'node_filesystem_device_error == 1'
+                severity: critical
               - name: Host inodes will fill in 24 hours
                 description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate
                 query: '(node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
@@ -317,7 +321,7 @@ groups:
                 description: 'Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.'
                 query: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
                 severity: warning
-                for: 2m
+                for: 10m
               - name: Host clock not synchronising
                 description: 'Clock not synchronising. Ensure NTP is configured on this host.'
                 query: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
@@ -1339,6 +1343,33 @@ groups:
                 for: 1m
                 severity: critical
 
+      - name: Nats
+        exporters:
+          - name: nats-io/prometheus-nats-exporter
+            slug: nats-exporter
+            doc_url: https://github.com/nats-io/prometheus-nats-exporter
+            rules:
+              - name: Nats high connection count
+                description: 'High number of NATS connections ({{ $value }}) for {{ $labels.instance }}'
+                query: 'gnatsd_varz_connections > 100'
+                severity: warning
+                for: 3m
+              - name: Nats high pending bytes
+                description: 'High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}'
+                query: 'gnatsd_connz_pending_bytes > 100000'
+                severity: warning
+                for: 3m
+              - name: Nats high subscriptions count
+                description: 'High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}'
+                query: 'gnatsd_connz_subscriptions > 50'
+                severity: warning
+                for: 3m
+              - name: Nats high routes count
+                description: 'High number of NATS routes ({{ $value }}) for {{ $labels.instance }}'
+                query: 'gnatsd_routez_num_routes > 10'
+                severity: warning
+                for: 3m
+
       - name: Solr
         exporters:
           - name: embedded exporter
@@ -1985,8 +2016,8 @@ groups:
                 severity: warning
                 for: 1m
               - name: Istio latency 99 percentile
-                description: Istio 1% slowest requests are longer than 1s.
-                query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
+                description: Istio 1% slowest requests are longer than 1000ms.
+                query: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
                 severity: warning
                 for: 1m
               - name: Istio Pilot Duplicate Entry
diff --git a/dist/rules/host-and-hardware/node-exporter.yml b/dist/rules/host-and-hardware/node-exporter.yml
index 942cf58..6e108d0 100644
--- a/dist/rules/host-and-hardware/node-exporter.yml
+++ b/dist/rules/host-and-hardware/node-exporter.yml
@@ -312,7 +312,7 @@ groups:
 
     - alert: HostClockSkew
       expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
-      for: 2m
+      for: 10m
       labels:
         severity: warning
       annotations:
diff --git a/dist/rules/istio/embedded-exporter.yml b/dist/rules/istio/embedded-exporter.yml
index 6705070..4ce9f54 100644
--- a/dist/rules/istio/embedded-exporter.yml
+++ b/dist/rules/istio/embedded-exporter.yml
@@ -77,13 +77,13 @@ groups:
         description: "Istio average requests execution is longer than 100ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: IstioLatency99Percentile
-      expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1'
+      expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000'
       for: 1m
       labels:
         severity: warning
       annotations:
         summary: Istio latency 99 percentile (instance {{ $labels.instance }})
-        description: "Istio 1% slowest requests are longer than 1s.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        description: "Istio 1% slowest requests are longer than 1000ms.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
 
     - alert: IstioPilotDuplicateEntry
       expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0'
diff --git a/dist/rules/nats/nats-exporter.yml b/dist/rules/nats/nats-exporter.yml
new file mode 100644
index 0000000..13eda2b
--- /dev/null
+++ b/dist/rules/nats/nats-exporter.yml
@@ -0,0 +1,41 @@
+groups:
+
+- name: NatsExporter
+
+  rules:
+
+    - alert: NatsHighConnectionCount
+      expr: 'gnatsd_varz_connections > 100'
+      for: 3m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high connection count (instance {{ $labels.instance }})
+        description: "High number of NATS connections ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighPendingBytes
+      expr: 'gnatsd_connz_pending_bytes > 100000'
+      for: 3m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high pending bytes (instance {{ $labels.instance }})
+        description: "High number of NATS pending bytes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighSubscriptionsCount
+      expr: 'gnatsd_connz_subscriptions > 50'
+      for: 3m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high subscriptions count (instance {{ $labels.instance }})
+        description: "High number of NATS subscriptions ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+
+    - alert: NatsHighRoutesCount
+      expr: 'gnatsd_routez_num_routes > 10'
+      for: 3m
+      labels:
+        severity: warning
+      annotations:
+        summary: Nats high routes count (instance {{ $labels.instance }})
+        description: "High number of NATS routes ({{ $value }}) for {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/test/README.md b/test/README.md
deleted file mode 100644
index 90cbac0..0000000
--- a/test/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-
-Since some rules can be duplicated (multiple exporters), I added a prefix to rule name.
-
-Error:
-
-```
-$ promtool check rules test/rules.yml
-Checking rules.yml
-29 duplicate rule(s) found.
-Metric: CassandraClientRequestReadFailure
-Label(s):
-	severity: critical
-
-[...]
-
-Might cause inconsistency while recording expressions.
-```
diff --git a/test/template.yml b/test/template.yml
deleted file mode 100644
index 106163f..0000000
--- a/test/template.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-groups:
-
-- name: AllRules
-  rules:
-{% for group in groups %}{% assign groupIndex = forloop.index %}{% for service in group.services %}{% assign serviceIndex = forloop.index %}{% for exporter in service.exporters %}{% assign exporterIndex = forloop.index %}{% for rule in exporter.rules %}
-{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
-
-    {% for comment in comments %}# {{ comment | strip }}
-    {% endfor %}- alert: {{ groupIndex }}_{{ serviceIndex }}_{{ exporterIndex }}_{{ ruleNameCamelcase | remove: ' ' }}
-      expr: '{{ rule.query }}'
-      for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
-      labels:
-        severity: {{ rule.severity }}
-      annotations:
-        summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %})
-        description: "{{ rule.description | replace: '"', '\"' }}\n  VALUE = {% raw %}{{ $value }}{% endraw %}\n  LABELS = {% raw %}{{ $labels }}{% endraw %}"
-
-{% endfor %}{% endfor %}{% endfor %}{% endfor %}