diff --git a/README.md b/README.md index 821c3d2..2b567c4 100644 --- a/README.md +++ b/README.md @@ -36,6 +36,7 @@ Collection available here: **[https://awesome-prometheus-alerts.grep.to](https:/ - [Cassandra](https://awesome-prometheus-alerts.grep.to/rules#cassandra) - [Zookeeper](https://awesome-prometheus-alerts.grep.to/rules#zookeeper) - [Kafka](https://awesome-prometheus-alerts.grep.to/rules#kafka) +- [Pulsar](https://awesome-prometheus-alerts.grep.to/rules#pulsar) - [Solr](https://awesome-prometheus-alerts.grep.to/rules#solr) #### Reverse proxies and load balancers diff --git a/_data/rules.yml b/_data/rules.yml index 3378ec6..c9f3644 100644 --- a/_data/rules.yml +++ b/_data/rules.yml @@ -1200,57 +1200,59 @@ groups: - name: Pulsar exporters: - - rules: - - name: Pulsar Subscription High Number Of Backlog Entries + - name: embedded exporter + doc_url: https://pulsar.apache.org/docs/reference-metrics/ + rules: + - name: Pulsar subscription high number of backlog entries + description: "The number of subscription backlog entries is over 5k" query: sum(pulsar_subscription_back_log) by (subscription) > 5000 for: 1h severity: warning - description: "The number of subscription backlog entries is over 5k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar Subscription Very High Number Of Backlog Entries + - name: Pulsar subscription very high number of backlog entries + description: "The number of subscription backlog entries is over 100k" query: sum(pulsar_subscription_back_log) by (subscription) > 100000 for: 1h severity: critical - description: "The number of subscription backlog entries is over 100k\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar Topic Large Backlog Storage Size + - name: Pulsar topic large backlog storage size + description: "The topic backlog storage size is over 5 GB" query: sum(pulsar_storage_size > 5*1024*1024*1024) by (topic) for: 1h severity: warning - description: "The topic backlog storage size is over 5 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PulsarTopicVeryLargeBacklogStorageSize + - name: Pulsar topic very large backlog storage size + description: "The topic backlog storage size is over 20 GB" query: sum(pulsar_storage_size > 20*1024*1024*1024) by (topic) for: 1h severity: critical - description: "The topic backlog storage size is over 20 GB\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar High Write Latency + - name: Pulsar high write latency + description: "Messages cannot be written in a timely fashion" query: sum(pulsar_storage_write_latency_overflow > 0) by (topic) for: 1h severity: critical - description: "Messages cannot be written in a timely fashion\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar Large Message Payload + - name: Pulsar large message payload + description: "Observing large message payload (> 1MB)" query: sum(pulsar_entry_size_overflow > 0) by (topic) for: 1h severity: warning - description: "Observing large message payload (> 1MB)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PulsarHighLedgerDiskUsage + - name: Pulsar high ledger disk usage + description: "Observing Ledger Disk Usage (> 75%)" query: sum(bookie_ledger_dir__pulsar_data_bookkeeper_ledgers_usage) by (kubernetes_pod_name) > 75 for: 1h severity: critical - description: "Observing Ledger Disk Usage (> 75%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar Read Only Bookies + - name: Pulsar read only bookies + description: "Observing Readonly Bookies" query: count(bookie_SERVER_STATUS{} == 0) by (pod) for: 5m severity: critical - description: "Observing Readonly Bookies\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar High Number Of Function Errors + - name: Pulsar high number of function errors + description: "Observing more than 10 Function errors per minute" query: sum((rate(pulsar_function_user_exceptions_total{}[1m]) + rate(pulsar_function_system_exceptions_total{}[1m])) > 10) by (name) for: 1m severity: critical - description: "Observing more than 10 Function errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: Pulsar High Number Of Sink Errors + - name: Pulsar high number of sink errors + description: "Observing more than 10 Sink errors per minute" query: sum(rate(pulsar_sink_sink_exceptions_total{}[1m]) > 10) by (name) for: 1m severity: critical - description: "Observing more than 10 Sink errors per minute\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: Solr exporters: @@ -2206,4 +2208,3 @@ groups: * FAILURE 2 false - The build had a fatal error. * NOT_BUILT 3 false - The module was not built. * ABORTED 4 false - The build was manually aborted. -