From 8941f71c6c413d0309e275cc0505d91737e08611 Mon Sep 17 00:00:00 2001 From: Samuel Berthe Date: Mon, 18 Apr 2022 23:30:32 +0200 Subject: [PATCH] chore(ci): adding test with promtool (#281) --- .github/workflows/test.yml | 35 +++++++++++++++++++++++++++++++++++ .gitignore | 2 ++ _data/rules.yml | 10 ++++------ test/README.md | 17 +++++++++++++++++ test/template.yml | 18 ++++++++++++++++++ 5 files changed, 76 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/test.yml create mode 100644 test/README.md create mode 100644 test/template.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..5c92f3c --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,35 @@ +name: Promtool check + +on: [pull_request, push] + +jobs: + promtool-check: + name: Check alert rules syntax + runs-on: ubuntu-latest + steps: + - name: Checkout Repo + uses: actions/checkout@v2 + + - name: Set up Ruby + uses: actions/setup-ruby@v1 + + - name: Set up yq + uses: mikefarah/yq@master + + - name: Install liquid + run: gem install liquid-cli + + - name: Build rule configuration + run: | + gem install liquid-cli + cat _data/rules.yml | yq -I 0 -o json > _data/rules.json + cat test/template.yml | liquid "$(< _data/rules.json)" > test/rules.yml + + - name: Check Prometheus alert rules + uses: peimanja/promtool-github-actions@master + with: + promtool_actions_subcommand: 'rules' + promtool_actions_files: 'test/rules.yml' + promtool_actions_comment: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index 2ca8682..8353a06 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ _site/ .sass-cache/ .jekyll-cache/ .jekyll-metadata +_data/rules.json +test/rules.yml diff --git a/_data/rules.yml b/_data/rules.yml index 319e4ba..7e7dbdd 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1212,9 +1212,7 @@ groups: severity: warning - name: Kafka consumer lag description: Kafka consumer has a 30 minutes and increasing lag - query: | - 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) - AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0' + query: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0' severity: warning for: 15m @@ -1414,7 +1412,7 @@ groups: for: 1m - name: HAProxy retry high description: High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend - query: 'rate(sum by (backend) (haproxy_backend_retry_warnings_total)) > 10' + query: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10' severity: warning for: 2m - name: HAProxy backend down @@ -1427,12 +1425,12 @@ groups: severity: critical - name: HAProxy frontend security blocked requests description: HAProxy is blocking requests for security reason - query: 'rate(sum by (frontend) (haproxy_frontend_requests_denied_total)) > 10' + query: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10' severity: warning for: 2m - name: HAProxy server healthcheck failure description: Some server healthcheck are failing on {{ $labels.server }} - query: 'increase(haproxy_server_check_failures_total) > 0' + query: 'increase(haproxy_server_check_failures_total[1m]) > 0' severity: warning for: 1m diff --git a/test/README.md b/test/README.md new file mode 100644 index 0000000..90cbac0 --- /dev/null +++ b/test/README.md @@ -0,0 +1,17 @@ + +Since some rules can be duplicated (multiple exporters), I added a prefix to rule name. + +Error: + +``` +$ promtool check rules test/rules.yml +Checking rules.yml +29 duplicate rule(s) found. +Metric: CassandraClientRequestReadFailure +Label(s): + severity: critical + +[...] + +Might cause inconsistency while recording expressions. +``` diff --git a/test/template.yml b/test/template.yml new file mode 100644 index 0000000..106163f --- /dev/null +++ b/test/template.yml @@ -0,0 +1,18 @@ +groups: + +- name: AllRules + rules: +{% for group in groups %}{% assign groupIndex = forloop.index %}{% for service in group.services %}{% assign serviceIndex = forloop.index %}{% for exporter in service.exporters %}{% assign exporterIndex = forloop.index %}{% for rule in exporter.rules %} +{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %} + + {% for comment in comments %}# {{ comment | strip }} + {% endfor %}- alert: {{ groupIndex }}_{{ serviceIndex }}_{{ exporterIndex }}_{{ ruleNameCamelcase | remove: ' ' }} + expr: '{{ rule.query }}' + for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %} + labels: + severity: {{ rule.severity }} + annotations: + summary: {{ rule.name }} (instance {% raw %}{{ $labels.instance }}{% endraw %}) + description: "{{ rule.description | replace: '"', '\"' }}\n VALUE = {% raw %}{{ $value }}{% endraw %}\n LABELS = {% raw %}{{ $labels }}{% endraw %}" + +{% endfor %}{% endfor %}{% endfor %}{% endfor %}