mirror of
https://github.com/samber/awesome-prometheus-alerts.git
synced 2026-06-26 19:37:27 +08:00
commit
e2ffbfb653
84 changed files with 320 additions and 3 deletions
|
|
@ -18,7 +18,7 @@ Please ensure your pull request adheres to the following guidelines:
|
|||
|
||||
## Improving Github page
|
||||
|
||||
### Run localy
|
||||
### Run locally
|
||||
|
||||
```
|
||||
gem install bundler
|
||||
|
|
|
|||
|
|
@ -106,6 +106,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
|
|||
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
|
||||
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
|
||||
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
|
||||
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
|
||||
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
|
||||
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)
|
||||
|
||||
|
|
|
|||
|
|
@ -3183,6 +3183,76 @@ groups:
|
|||
query: "count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) "
|
||||
severity: critical
|
||||
|
||||
- name: OpenTelemetry Collector
|
||||
exporters:
|
||||
- name: Embedded exporter
|
||||
slug: embedded-exporter
|
||||
doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/
|
||||
comments: |
|
||||
OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
|
||||
These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
|
||||
All collector internal metrics are prefixed with 'otelcol_'.
|
||||
rules:
|
||||
- name: OpenTelemetry Collector down
|
||||
description: OpenTelemetry Collector instance has disappeared or is not being scraped
|
||||
query: 'up{job=~".*otel.*collector.*"} == 0'
|
||||
severity: critical
|
||||
for: 1m
|
||||
- name: OpenTelemetry Collector receiver refused spans
|
||||
description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}"
|
||||
query: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector receiver refused metric points
|
||||
description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}"
|
||||
query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector receiver refused log records
|
||||
description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}"
|
||||
query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
|
||||
severity: critical
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector exporter failed spans
|
||||
description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}"
|
||||
query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector exporter failed metric points
|
||||
description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}"
|
||||
query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector exporter failed log records
|
||||
description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}"
|
||||
query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector exporter queue nearly full
|
||||
description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full"
|
||||
query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0'
|
||||
severity: warning
|
||||
- name: OpenTelemetry Collector processor refused spans
|
||||
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure"
|
||||
query: 'rate(otelcol_processor_refused_spans[5m]) > 0'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector processor refused metric points
|
||||
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure"
|
||||
query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector high memory usage
|
||||
description: "OpenTelemetry Collector memory usage is above 90%"
|
||||
query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
|
||||
severity: warning
|
||||
for: 5m
|
||||
- name: OpenTelemetry Collector OTLP receiver errors
|
||||
description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused"
|
||||
query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
|
||||
severity: critical
|
||||
for: 2m
|
||||
|
||||
- name: Jenkins
|
||||
exporters:
|
||||
- name: Metric plugin
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: LusitaniaeApacheExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ApacheDown
|
||||
|
|
|
|||
1
dist/rules/apc-ups/apcupsd_exporter.yml
vendored
1
dist/rules/apc-ups/apcupsd_exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: Apcupsd_exporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ApcUpsBatteryNearlyEmpty
|
||||
|
|
|
|||
1
dist/rules/argocd/embedded-exporter.yml
vendored
1
dist/rules/argocd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ArgocdServiceNotSynced
|
||||
|
|
|
|||
5
dist/rules/blackbox/blackbox-exporter.yml
vendored
5
dist/rules/blackbox/blackbox-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: BlackboxExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: BlackboxProbeFailed
|
||||
|
|
@ -58,6 +59,10 @@ groups:
|
|||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
|
||||
# need to enable insecure_skip_verify. Note that this will disable
|
||||
# certificate validation.
|
||||
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
|
||||
- alert: BlackboxSslCertificateExpired
|
||||
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
|
||||
for: 0m
|
||||
|
|
|
|||
1
dist/rules/caddy/embedded-exporter.yml
vendored
1
dist/rules/caddy/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CaddyReverseProxyDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: CriteoCassandraExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CassandraHintsCount
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: InstaclustrCassandraExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CassandraNodeIsUnavailable
|
||||
|
|
|
|||
1
dist/rules/ceph/embedded-exporter.yml
vendored
1
dist/rules/ceph/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CephState
|
||||
|
|
|
|||
4
dist/rules/clickhouse/embedded-exporter.yml
vendored
4
dist/rules/clickhouse/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ClickhouseNodeDown
|
||||
|
|
@ -85,6 +86,7 @@ groups:
|
|||
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
|
||||
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighNetworkTraffic
|
||||
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
|
||||
for: 5m
|
||||
|
|
@ -94,6 +96,7 @@ groups:
|
|||
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
|
||||
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighTcpConnections
|
||||
expr: 'ClickHouseMetrics_TCPConnection > 400'
|
||||
for: 5m
|
||||
|
|
@ -166,6 +169,7 @@ groups:
|
|||
summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
|
||||
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please replace the threshold with an appropriate value
|
||||
- alert: ClickhouseHighNetworkUsage
|
||||
expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
|
||||
for: 2m
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: LablabsCloudflareExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CloudflareHttp4xxErrorRate
|
||||
|
|
|
|||
1
dist/rules/consul/consul-exporter.yml
vendored
1
dist/rules/consul/consul-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ConsulExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ConsulServiceHealthcheckFailed
|
||||
|
|
|
|||
1
dist/rules/coredns/embedded-exporter.yml
vendored
1
dist/rules/coredns/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CorednsPanicCount
|
||||
|
|
|
|||
1
dist/rules/cortex/embedded-exporter.yml
vendored
1
dist/rules/cortex/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CortexRulerConfigurationReloadFailure
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: GesellixCouchdbPrometheusExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: CouchdbNodeDown
|
||||
|
|
|
|||
|
|
@ -2,8 +2,10 @@ groups:
|
|||
|
||||
- name: GoogleCadvisor
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerKilled
|
||||
expr: 'time() - container_last_seen > 60'
|
||||
for: 0m
|
||||
|
|
@ -13,6 +15,7 @@ groups:
|
|||
summary: Container killed (instance {{ $labels.instance }})
|
||||
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerAbsent
|
||||
expr: 'absent(container_last_seen)'
|
||||
for: 5m
|
||||
|
|
@ -31,6 +34,7 @@ groups:
|
|||
summary: Container High CPU utilization (instance {{ $labels.instance }})
|
||||
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
||||
- alert: ContainerHighMemoryUsage
|
||||
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
|
||||
for: 2m
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: PrometheusCommunityElasticsearchExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ElasticsearchHeapUsageTooHigh
|
||||
|
|
|
|||
1
dist/rules/etcd/embedded-exporter.yml
vendored
1
dist/rules/etcd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: EtcdInsufficientMembers
|
||||
|
|
|
|||
1
dist/rules/fluxcd/embedded-exporter.yml
vendored
1
dist/rules/fluxcd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: FluxKustomizationFailure
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ZnerolFreeswitchExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: FreeswitchDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: GrafanaAlloyServiceDown
|
||||
|
|
|
|||
1
dist/rules/graph-node/embedded-exporter.yml
vendored
1
dist/rules/graph-node/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ProviderFailedBecauseNet_versionFailed
|
||||
|
|
|
|||
1
dist/rules/hadoop/jmx_exporter.yml
vendored
1
dist/rules/hadoop/jmx_exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: Jmx_exporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HadoopNameNodeDown
|
||||
|
|
|
|||
1
dist/rules/haproxy/embedded-exporter-v2.yml
vendored
1
dist/rules/haproxy/embedded-exporter-v2.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterV2
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HaproxyHighHttp4xxErrorRateBackend
|
||||
|
|
|
|||
1
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
1
dist/rules/haproxy/haproxy-exporter-v1.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: HaproxyExporterV1
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HaproxyDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: VaultSealed
|
||||
|
|
|
|||
12
dist/rules/host-and-hardware/node-exporter.yml
vendored
12
dist/rules/host-and-hardware/node-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: NodeExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: HostOutOfMemory
|
||||
|
|
@ -22,6 +23,7 @@ groups:
|
|||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
|
||||
for: 0m
|
||||
|
|
@ -58,6 +60,9 @@ groups:
|
|||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
|
||||
for: 2m
|
||||
|
|
@ -67,6 +72,9 @@ groups:
|
|||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostDiskMayFillIn24Hours
|
||||
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
|
||||
for: 2m
|
||||
|
|
@ -130,6 +138,7 @@ groups:
|
|||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
|
||||
for: 1w
|
||||
|
|
@ -166,6 +175,9 @@ groups:
|
|||
summary: Host unusual disk IO (instance {{ $labels.instance }})
|
||||
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# x2 context switches is an arbitrary number.
|
||||
# The alert threshold depends on the nature of the application.
|
||||
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
||||
- alert: HostContextSwitchingHigh
|
||||
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
|
||||
for: 0m
|
||||
|
|
|
|||
1
dist/rules/istio/embedded-exporter.yml
vendored
1
dist/rules/istio/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: IstioKubernetesGatewayAvailabilityDrop
|
||||
|
|
|
|||
7
dist/rules/jenkins/metric-plugin.yml
vendored
7
dist/rules/jenkins/metric-plugin.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: MetricPlugin
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JenkinsOffline
|
||||
|
|
@ -58,6 +59,12 @@ groups:
|
|||
summary: Jenkins build tests failing (instance {{ $labels.instance }})
|
||||
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# * RUNNING -1 true - The build is still in progress.
|
||||
# * SUCCESS 0 true - The build had no errors.
|
||||
# * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
|
||||
# * FAILURE 2 false - The build had a fatal error.
|
||||
# * NOT_BUILT 3 false - The module was not built.
|
||||
# * ABORTED 4 false - The build was manually aborted.
|
||||
- alert: JenkinsLastBuildFailed
|
||||
expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
|
||||
for: 0m
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: CzerwonkJunosExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JuniperSwitchDown
|
||||
|
|
|
|||
1
dist/rules/jvm/jvm-exporter.yml
vendored
1
dist/rules/jvm/jvm-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: JvmExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: JvmMemoryFillingUp
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: DanielqsjKafkaExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KafkaTopicsReplicas
|
||||
|
|
|
|||
1
dist/rules/kafka/linkedin-kafka-exporter.yml
vendored
1
dist/rules/kafka/linkedin-kafka-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: LinkedinKafkaExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KafkaTopicOffsetDecreased
|
||||
|
|
|
|||
4
dist/rules/kubernetes/kubestate-exporter.yml
vendored
4
dist/rules/kubernetes/kubestate-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: KubestateExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: KubernetesNodeNotReady
|
||||
|
|
@ -13,6 +14,8 @@ groups:
|
|||
summary: Kubernetes Node ready (node {{ $labels.node }})
|
||||
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Kubernetes nodes with scheduling disabled are fine in themselves.
|
||||
# This alert can be useful to get warned when nodes stay unschedulable for a long time.
|
||||
- alert: KubernetesNodeSchedulingDisabled
|
||||
expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
|
||||
for: 30m
|
||||
|
|
@ -265,6 +268,7 @@ groups:
|
|||
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
|
||||
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Threshold should be customized for each cronjob name.
|
||||
- alert: KubernetesCronjobTooLong
|
||||
expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600'
|
||||
for: 0m
|
||||
|
|
|
|||
1
dist/rules/linkerd/embedded-exporter.yml
vendored
1
dist/rules/linkerd/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: LinkerdHighErrorRate
|
||||
|
|
|
|||
1
dist/rules/loki/embedded-exporter.yml
vendored
1
dist/rules/loki/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: LokiProcessTooManyRestarts
|
||||
|
|
|
|||
1
dist/rules/meilisearch/embedded-exporter.yml
vendored
1
dist/rules/meilisearch/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MeilisearchIndexIsEmpty
|
||||
|
|
|
|||
1
dist/rules/minio/embedded-exporter.yml
vendored
1
dist/rules/minio/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MinioClusterDiskOffline
|
||||
|
|
|
|||
1
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
1
dist/rules/mongodb/dcu-mongodb-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: DcuMongodbExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MongodbReplicationLag
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: PerconaMongodbExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MongodbDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: StefanprodanMgobExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MgobBackupFailed
|
||||
|
|
|
|||
1
dist/rules/mysql/mysqld-exporter.yml
vendored
1
dist/rules/mysql/mysqld-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: MysqldExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: MysqlDown
|
||||
|
|
|
|||
1
dist/rules/nats/nats-exporter.yml
vendored
1
dist/rules/nats/nats-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: NatsExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NatsHighConnectionCount
|
||||
|
|
|
|||
1
dist/rules/netdata/embedded-exporter.yml
vendored
1
dist/rules/netdata/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NetdataHighCpuUsage
|
||||
|
|
|
|||
1
dist/rules/nginx/knyar-nginx-exporter.yml
vendored
1
dist/rules/nginx/knyar-nginx-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: KnyarNginxExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NginxHighHttp4xxErrorRate
|
||||
|
|
|
|||
1
dist/rules/nomad/embedded-exporter.yml
vendored
1
dist/rules/nomad/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: NomadJobFailed
|
||||
|
|
|
|||
1
dist/rules/openebs/embedded-exporter.yml
vendored
1
dist/rules/openebs/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: OpenebsUsedPoolCapacity
|
||||
|
|
|
|||
117
dist/rules/opentelemetry-collector/embedded-exporter.yml
vendored
Normal file
117
dist/rules/opentelemetry-collector/embedded-exporter.yml
vendored
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
groups:
|
||||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
# OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
|
||||
# These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
|
||||
# All collector internal metrics are prefixed with 'otelcol_'.
|
||||
|
||||
rules:
|
||||
|
||||
- alert: OpentelemetryCollectorDown
|
||||
expr: 'up{job=~".*otel.*collector.*"} == 0'
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorReceiverRefusedSpans
|
||||
expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorReceiverRefusedMetricPoints
|
||||
expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorReceiverRefusedLogRecords
|
||||
expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorExporterFailedSpans
|
||||
expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorExporterFailedMetricPoints
|
||||
expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorExporterFailedLogRecords
|
||||
expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorExporterQueueNearlyFull
|
||||
expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and otelcol_exporter_queue_capacity > 0'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorProcessorRefusedSpans
|
||||
expr: 'rate(otelcol_processor_refused_spans[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorProcessorRefusedMetricPoints
|
||||
expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorHighMemoryUsage
|
||||
expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: OpentelemetryCollectorOtlpReceiverErrors
|
||||
expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
|
||||
description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterPatroni
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PatroniHasNoLeader
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: SpreakerPgbouncerExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PgbouncerActiveConnections
|
||||
|
|
|
|||
1
dist/rules/php-fpm/bakins-fpm-exporter.yml
vendored
1
dist/rules/php-fpm/bakins-fpm-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: BakinsFpmExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: Php-fpmMax-childrenReached
|
||||
|
|
|
|||
4
dist/rules/postgresql/postgres-exporter.yml
vendored
4
dist/rules/postgresql/postgres-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: PostgresExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PostgresqlDown
|
||||
|
|
@ -166,6 +167,7 @@ groups:
|
|||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlBloatIndexHigh(>80%)
|
||||
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
|
||||
for: 1h
|
||||
|
|
@ -175,6 +177,7 @@ groups:
|
|||
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlBloatTableHigh(>80%)
|
||||
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
|
||||
for: 1h
|
||||
|
|
@ -184,6 +187,7 @@ groups:
|
|||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
|
||||
- alert: PostgresqlInvalidIndex
|
||||
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
for: 6h
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PrometheusJobMissing
|
||||
|
|
|
|||
1
dist/rules/promtail/embedded-exporter.yml
vendored
1
dist/rules/promtail/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PromtailRequestErrors
|
||||
|
|
|
|||
1
dist/rules/pulsar/embedded-exporter.yml
vendored
1
dist/rules/pulsar/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: PulsarSubscriptionHighNumberOfBacklogEntries
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: KbuddeRabbitmqExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: RabbitmqDown
|
||||
|
|
@ -49,6 +50,7 @@ groups:
|
|||
summary: RabbitMQ too many connections (instance {{ $labels.instance }})
|
||||
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the queue name in dedicated label.
|
||||
- alert: RabbitmqDeadLetterQueueFillingUp
|
||||
expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
|
||||
for: 1m
|
||||
|
|
@ -58,6 +60,7 @@ groups:
|
|||
summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
|
||||
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the queue name in dedicated label.
|
||||
- alert: RabbitmqTooManyMessagesInQueue
|
||||
expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
|
||||
for: 2m
|
||||
|
|
@ -67,6 +70,7 @@ groups:
|
|||
summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
|
||||
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the queue name in dedicated label.
|
||||
- alert: RabbitmqSlowQueueConsuming
|
||||
expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
|
||||
for: 2m
|
||||
|
|
@ -85,6 +89,7 @@ groups:
|
|||
summary: RabbitMQ no consumer (instance {{ $labels.instance }})
|
||||
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the queue name in dedicated label.
|
||||
- alert: RabbitmqTooManyConsumers
|
||||
expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
|
||||
for: 0m
|
||||
|
|
@ -94,6 +99,7 @@ groups:
|
|||
summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
|
||||
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Indicate the exchange name in dedicated label.
|
||||
- alert: RabbitmqUnactiveExchange
|
||||
expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
|
||||
for: 2m
|
||||
|
|
|
|||
1
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
1
dist/rules/rabbitmq/rabbitmq-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: RabbitmqExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: RabbitmqNodeDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: Oliver006RedisExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: RedisDown
|
||||
|
|
@ -67,6 +68,7 @@ groups:
|
|||
summary: Redis missing backup (instance {{ $labels.instance }})
|
||||
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
|
||||
- alert: RedisOutOfSystemMemory
|
||||
expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90'
|
||||
for: 2m
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: SmartctlExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SmartDeviceTemperatureWarning
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: StrechSidekiqExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SidekiqQueueSize
|
||||
|
|
|
|||
1
dist/rules/solr/embedded-exporter.yml
vendored
1
dist/rules/solr/embedded-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SolrUpdateErrors
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: NlamiraultSpeedtestExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SpeedtestSlowInternetDownload
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: OzarklakeMssqlExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SqlServerDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: RibbybibbySslExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: SslCertificateProbeFailed
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosBucketReplicate
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosBucketReplicateErrorRate
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-compactor.yml
vendored
1
dist/rules/thanos/thanos-compactor.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosCompactor
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosCompactorMultipleRunning
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosComponentAbsent
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosCompactIsDown
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-query.yml
vendored
1
dist/rules/thanos/thanos-query.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosQuery
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosQueryHttpRequestQueryErrorRateHigh
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-receiver.yml
vendored
1
dist/rules/thanos/thanos-receiver.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosReceiver
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosReceiveHttpRequestErrorRateHigh
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-ruler.yml
vendored
1
dist/rules/thanos/thanos-ruler.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosRuler
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosRuleQueueIsDroppingAlerts
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-sidecar.yml
vendored
1
dist/rules/thanos/thanos-sidecar.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosSidecar
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosSidecarBucketOperationsFailed
|
||||
|
|
|
|||
1
dist/rules/thanos/thanos-store.yml
vendored
1
dist/rules/thanos/thanos-store.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: ThanosStore
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ThanosStoreGrpcErrorRate
|
||||
|
|
|
|||
1
dist/rules/traefik/embedded-exporter-v1.yml
vendored
1
dist/rules/traefik/embedded-exporter-v1.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterV1
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: TraefikBackendDown
|
||||
|
|
|
|||
1
dist/rules/traefik/embedded-exporter-v2.yml
vendored
1
dist/rules/traefik/embedded-exporter-v2.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: EmbeddedExporterV2
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: TraefikServiceDown
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: PryordaVmwareExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: VirtualMachineMemoryWarning
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: WindowsExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: WindowsServerCollectorError
|
||||
|
|
|
|||
1
dist/rules/zfs/node-exporter.yml
vendored
1
dist/rules/zfs/node-exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: NodeExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ZfsOfflinePool
|
||||
|
|
|
|||
8
dist/rules/zfs/zfs_exporter.yml
vendored
8
dist/rules/zfs/zfs_exporter.yml
vendored
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: Zfs_exporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ZfsPoolOutOfSpace
|
||||
|
|
@ -13,6 +14,13 @@ groups:
|
|||
summary: ZFS pool out of space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# 0: ONLINE
|
||||
# 1: DEGRADED
|
||||
# 2: FAULTED
|
||||
# 3: OFFLINE
|
||||
# 4: UNAVAIL
|
||||
# 5: REMOVED
|
||||
# 6: SUSPENDED
|
||||
- alert: ZfsPoolUnhealthy
|
||||
expr: 'zfs_pool_health > 0'
|
||||
for: 0m
|
||||
|
|
|
|||
|
|
@ -2,4 +2,5 @@ groups:
|
|||
|
||||
- name: CloudflareKafkaZookeeperExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ groups:
|
|||
|
||||
- name: DabealuZookeeperExporter
|
||||
|
||||
|
||||
rules:
|
||||
|
||||
- alert: ZookeeperDown
|
||||
|
|
|
|||
6
dist/template.yml
vendored
6
dist/template.yml
vendored
|
|
@ -2,9 +2,13 @@ groups:
|
|||
{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
|
||||
- name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }}
|
||||
|
||||
{% assign lines = comments | split: "
|
||||
" %}{% for line in lines %}# {{ line | strip }}
|
||||
{% endfor %}
|
||||
rules:
|
||||
{% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
|
||||
{% for comment in comments %}# {{ comment | strip }}
|
||||
{% assign lines = rule.comments | split: "
|
||||
" %}{% for line in lines %}# {{ line | strip }}
|
||||
{% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
|
||||
expr: '{{ rule.query }}'
|
||||
for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}
|
||||
|
|
|
|||
3
rules.md
3
rules.md
|
|
@ -62,8 +62,9 @@
|
|||
// @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋
|
||||
{% endhighlight %}
|
||||
{% else %}
|
||||
{{ exporter.comments | strip | newline_to_br }}
|
||||
{% highlight bash %}
|
||||
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
|
||||
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/ref/head/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
|
||||
{% endhighlight %}
|
||||
{% endif %}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue