Merge pull request #3 from samber/master

Merge latest from upstream
This commit is contained in:
Simon Matic Langford 2025-11-17 10:20:20 +00:00 committed by GitHub
commit e2ffbfb653
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
84 changed files with 320 additions and 3 deletions

View file

@ -18,7 +18,7 @@ Please ensure your pull request adheres to the following guidelines:
## Improving Github page ## Improving Github page
### Run localy ### Run locally
``` ```
gem install bundler gem install bundler

View file

@ -106,6 +106,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail) - [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex) - [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy) - [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins) - [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node) - [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)

View file

@ -3183,6 +3183,76 @@ groups:
query: "count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) " query: "count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) "
severity: critical severity: critical
# Catalogue entry for the OpenTelemetry Collector: one embedded exporter and
# the alert rules built on the collector's own `otelcol_*` metrics.
- name: OpenTelemetry Collector
  exporters:
    - name: Embedded exporter
      slug: embedded-exporter
      doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/
      comments: |
        OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
        These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
        All collector internal metrics are prefixed with 'otelcol_'.
      rules:
        # Availability: relies on the scrape job name matching the regex below.
        - name: OpenTelemetry Collector down
          description: OpenTelemetry Collector instance has disappeared or is not being scraped
          query: 'up{job=~".*otel.*collector.*"} == 0'
          severity: critical
          for: 1m

        # Receivers: any non-zero refusal rate sustained for 5 minutes.
        - name: OpenTelemetry Collector receiver refused spans
          description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused metric points
          description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused log records
          description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
          severity: critical
          for: 5m

        # Exporters: any non-zero send-failure rate sustained for 5 minutes.
        - name: OpenTelemetry Collector exporter failed spans
          description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed metric points
          description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed log records
          description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
          severity: warning
          for: 5m

        # The division keeps only the labels listed in on(...), so the
        # capacity guard has to be matched on the same labels. A bare `and`
        # requires identical label sets and would silently drop every result
        # when the capacity series carries any additional label.
        # The guard itself excludes queues whose reported capacity is 0.
        - name: OpenTelemetry Collector exporter queue nearly full
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full"
          query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
          severity: warning

        # Processors: any non-zero refusal rate sustained for 5 minutes.
        - name: OpenTelemetry Collector processor refused spans
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector processor refused metric points
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
          severity: warning
          for: 5m

        # Ratio of heap_alloc_bytes to total_sys_memory_bytes, joined per instance and job.
        - name: OpenTelemetry Collector high memory usage
          description: "OpenTelemetry Collector memory usage is above 90%"
          query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
          severity: warning
          for: 5m

        # Fires only when the accepted-span rate is exactly 0 while spans are still being refused.
        - name: OpenTelemetry Collector OTLP receiver errors
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused"
          query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
          severity: critical
          for: 2m
- name: Jenkins - name: Jenkins
exporters: exporters:
- name: Metric plugin - name: Metric plugin

View file

@ -2,6 +2,7 @@ groups:
- name: LusitaniaeApacheExporter - name: LusitaniaeApacheExporter
rules: rules:
- alert: ApacheDown - alert: ApacheDown

View file

@ -2,6 +2,7 @@ groups:
- name: Apcupsd_exporter - name: Apcupsd_exporter
rules: rules:
- alert: ApcUpsBatteryNearlyEmpty - alert: ApcUpsBatteryNearlyEmpty

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: ArgocdServiceNotSynced - alert: ArgocdServiceNotSynced

View file

@ -2,6 +2,7 @@ groups:
- name: BlackboxExporter - name: BlackboxExporter
rules: rules:
- alert: BlackboxProbeFailed - alert: BlackboxProbeFailed
@ -58,6 +59,10 @@ groups:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
# need to enable insecure_skip_verify. Note that this will disable
# certificate validation.
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- alert: BlackboxSslCertificateExpired - alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: CaddyReverseProxyDown - alert: CaddyReverseProxyDown

View file

@ -2,6 +2,7 @@ groups:
- name: CriteoCassandraExporter - name: CriteoCassandraExporter
rules: rules:
- alert: CassandraHintsCount - alert: CassandraHintsCount

View file

@ -2,6 +2,7 @@ groups:
- name: InstaclustrCassandraExporter - name: InstaclustrCassandraExporter
rules: rules:
- alert: CassandraNodeIsUnavailable - alert: CassandraNodeIsUnavailable

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: CephState - alert: CephState

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: ClickhouseNodeDown - alert: ClickhouseNodeDown
@ -85,6 +86,7 @@ groups:
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }}) summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkTraffic - alert: ClickhouseHighNetworkTraffic
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250' expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
for: 5m for: 5m
@ -94,6 +96,7 @@ groups:
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }}) summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighTcpConnections - alert: ClickhouseHighTcpConnections
expr: 'ClickHouseMetrics_TCPConnection > 400' expr: 'ClickHouseMetrics_TCPConnection > 400'
for: 5m for: 5m
@ -166,6 +169,7 @@ groups:
summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }}) summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkUsage - alert: ClickhouseHighNetworkUsage
expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024' expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
for: 2m for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: LablabsCloudflareExporter - name: LablabsCloudflareExporter
rules: rules:
- alert: CloudflareHttp4xxErrorRate - alert: CloudflareHttp4xxErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: ConsulExporter - name: ConsulExporter
rules: rules:
- alert: ConsulServiceHealthcheckFailed - alert: ConsulServiceHealthcheckFailed

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: CorednsPanicCount - alert: CorednsPanicCount

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: CortexRulerConfigurationReloadFailure - alert: CortexRulerConfigurationReloadFailure

View file

@ -2,6 +2,7 @@ groups:
- name: GesellixCouchdbPrometheusExporter - name: GesellixCouchdbPrometheusExporter
rules: rules:
- alert: CouchdbNodeDown - alert: CouchdbNodeDown

View file

@ -2,8 +2,10 @@ groups:
- name: GoogleCadvisor - name: GoogleCadvisor
rules: rules:
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerKilled - alert: ContainerKilled
expr: 'time() - container_last_seen > 60' expr: 'time() - container_last_seen > 60'
for: 0m for: 0m
@ -13,6 +15,7 @@ groups:
summary: Container killed (instance {{ $labels.instance }}) summary: Container killed (instance {{ $labels.instance }})
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerAbsent - alert: ContainerAbsent
expr: 'absent(container_last_seen)' expr: 'absent(container_last_seen)'
for: 5m for: 5m
@ -31,6 +34,7 @@ groups:
summary: Container High CPU utilization (instance {{ $labels.instance }}) summary: Container High CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage - alert: ContainerHighMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80' expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
for: 2m for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: PrometheusCommunityElasticsearchExporter - name: PrometheusCommunityElasticsearchExporter
rules: rules:
- alert: ElasticsearchHeapUsageTooHigh - alert: ElasticsearchHeapUsageTooHigh

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: EtcdInsufficientMembers - alert: EtcdInsufficientMembers

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: FluxKustomizationFailure - alert: FluxKustomizationFailure

View file

@ -2,6 +2,7 @@ groups:
- name: ZnerolFreeswitchExporter - name: ZnerolFreeswitchExporter
rules: rules:
- alert: FreeswitchDown - alert: FreeswitchDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: GrafanaAlloyServiceDown - alert: GrafanaAlloyServiceDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: ProviderFailedBecauseNet_versionFailed - alert: ProviderFailedBecauseNet_versionFailed

View file

@ -2,6 +2,7 @@ groups:
- name: Jmx_exporter - name: Jmx_exporter
rules: rules:
- alert: HadoopNameNodeDown - alert: HadoopNameNodeDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV2 - name: EmbeddedExporterV2
rules: rules:
- alert: HaproxyHighHttp4xxErrorRateBackend - alert: HaproxyHighHttp4xxErrorRateBackend

View file

@ -2,6 +2,7 @@ groups:
- name: HaproxyExporterV1 - name: HaproxyExporterV1
rules: rules:
- alert: HaproxyDown - alert: HaproxyDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: VaultSealed - alert: VaultSealed

View file

@ -2,6 +2,7 @@ groups:
- name: NodeExporter - name: NodeExporter
rules: rules:
- alert: HostOutOfMemory - alert: HostOutOfMemory
@ -22,6 +23,7 @@ groups:
summary: Host memory under memory pressure (instance {{ $labels.instance }}) summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized - alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8' expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m for: 0m
@ -58,6 +60,9 @@ groups:
summary: Host unusual disk read rate (instance {{ $labels.instance }}) summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace - alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)' expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m for: 2m
@ -67,6 +72,9 @@ groups:
summary: Host out of disk space (instance {{ $labels.instance }}) summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskMayFillIn24Hours - alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0' expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m for: 2m
@ -130,6 +138,7 @@ groups:
summary: Host high CPU load (instance {{ $labels.instance }}) summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized - alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8' expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w for: 1w
@ -166,6 +175,9 @@ groups:
summary: Host unusual disk IO (instance {{ $labels.instance }}) summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# x2 context switches is an arbitrary number.
# The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitchingHigh - alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2' expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: IstioKubernetesGatewayAvailabilityDrop - alert: IstioKubernetesGatewayAvailabilityDrop

View file

@ -2,6 +2,7 @@ groups:
- name: MetricPlugin - name: MetricPlugin
rules: rules:
- alert: JenkinsOffline - alert: JenkinsOffline
@ -58,6 +59,12 @@ groups:
summary: Jenkins build tests failing (instance {{ $labels.instance }}) summary: Jenkins build tests failing (instance {{ $labels.instance }})
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# * RUNNING -1 true - The build had no errors.
# * SUCCESS 0 true - The build had no errors.
# * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
# * FAILURE 2 false - The build had a fatal error.
# * NOT_BUILT 3 false - The module was not built.
# * ABORTED 4 false - The build was manually aborted.
- alert: JenkinsLastBuildFailed - alert: JenkinsLastBuildFailed
expr: 'default_jenkins_builds_last_build_result_ordinal == 2' expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
for: 0m for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: CzerwonkJunosExporter - name: CzerwonkJunosExporter
rules: rules:
- alert: JuniperSwitchDown - alert: JuniperSwitchDown

View file

@ -2,6 +2,7 @@ groups:
- name: JvmExporter - name: JvmExporter
rules: rules:
- alert: JvmMemoryFillingUp - alert: JvmMemoryFillingUp

View file

@ -2,6 +2,7 @@ groups:
- name: DanielqsjKafkaExporter - name: DanielqsjKafkaExporter
rules: rules:
- alert: KafkaTopicsReplicas - alert: KafkaTopicsReplicas

View file

@ -2,6 +2,7 @@ groups:
- name: LinkedinKafkaExporter - name: LinkedinKafkaExporter
rules: rules:
- alert: KafkaTopicOffsetDecreased - alert: KafkaTopicOffsetDecreased

View file

@ -2,6 +2,7 @@ groups:
- name: KubestateExporter - name: KubestateExporter
rules: rules:
- alert: KubernetesNodeNotReady - alert: KubernetesNodeNotReady
@ -13,6 +14,8 @@ groups:
summary: Kubernetes Node ready (node {{ $labels.node }}) summary: Kubernetes Node ready (node {{ $labels.node }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Kubernetes nodes with scheduling disabled are fine.
# This alert is useful to get warned when nodes remain unschedulable for a long time.
- alert: KubernetesNodeSchedulingDisabled - alert: KubernetesNodeSchedulingDisabled
expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1' expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
for: 30m for: 30m
@ -265,6 +268,7 @@ groups:
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }}) summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold should be customized for each cronjob name.
- alert: KubernetesCronjobTooLong - alert: KubernetesCronjobTooLong
expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600' expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600'
for: 0m for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: LinkerdHighErrorRate - alert: LinkerdHighErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: LokiProcessTooManyRestarts - alert: LokiProcessTooManyRestarts

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: MeilisearchIndexIsEmpty - alert: MeilisearchIndexIsEmpty

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: MinioClusterDiskOffline - alert: MinioClusterDiskOffline

View file

@ -2,6 +2,7 @@ groups:
- name: DcuMongodbExporter - name: DcuMongodbExporter
rules: rules:
- alert: MongodbReplicationLag - alert: MongodbReplicationLag

View file

@ -2,6 +2,7 @@ groups:
- name: PerconaMongodbExporter - name: PerconaMongodbExporter
rules: rules:
- alert: MongodbDown - alert: MongodbDown

View file

@ -2,6 +2,7 @@ groups:
- name: StefanprodanMgobExporter - name: StefanprodanMgobExporter
rules: rules:
- alert: MgobBackupFailed - alert: MgobBackupFailed

View file

@ -2,6 +2,7 @@ groups:
- name: MysqldExporter - name: MysqldExporter
rules: rules:
- alert: MysqlDown - alert: MysqlDown

View file

@ -2,6 +2,7 @@ groups:
- name: NatsExporter - name: NatsExporter
rules: rules:
- alert: NatsHighConnectionCount - alert: NatsHighConnectionCount

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: NetdataHighCpuUsage - alert: NetdataHighCpuUsage

View file

@ -2,6 +2,7 @@ groups:
- name: KnyarNginxExporter - name: KnyarNginxExporter
rules: rules:
- alert: NginxHighHttp4xxErrorRate - alert: NginxHighHttp4xxErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: NomadJobFailed - alert: NomadJobFailed

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: OpenebsUsedPoolCapacity - alert: OpenebsUsedPoolCapacity

View file

@ -0,0 +1,117 @@
# Prometheus alerting rules for the OpenTelemetry Collector, built on the
# collector's own `otelcol_*` metrics plus the scrape-level `up` series.
groups:
  - name: EmbeddedExporter
    # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
    # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
    # All collector internal metrics are prefixed with 'otelcol_'.
    rules:
      # Availability: relies on the scrape job name matching the regex below.
      - alert: OpentelemetryCollectorDown
        expr: 'up{job=~".*otel.*collector.*"} == 0'
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Receivers: any non-zero refusal rate sustained for 5 minutes.
      - alert: OpentelemetryCollectorReceiverRefusedSpans
        expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
        expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: OpentelemetryCollectorReceiverRefusedLogRecords
        expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Exporters: any non-zero send-failure rate sustained for 5 minutes.
      - alert: OpentelemetryCollectorExporterFailedSpans
        expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: OpentelemetryCollectorExporterFailedMetricPoints
        expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: OpentelemetryCollectorExporterFailedLogRecords
        expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # The division keeps only the labels listed in on(...), so the capacity
      # guard has to be matched on the same labels. A bare `and` requires
      # identical label sets and would silently drop every result when the
      # capacity series carries any additional label.
      # The guard itself excludes queues whose reported capacity is 0.
      - alert: OpentelemetryCollectorExporterQueueNearlyFull
        expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Processors: any non-zero refusal rate sustained for 5 minutes.
      - alert: OpentelemetryCollectorProcessorRefusedSpans
        expr: 'rate(otelcol_processor_refused_spans[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
        expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Ratio of heap_alloc_bytes to total_sys_memory_bytes, joined per instance and job.
      - alert: OpentelemetryCollectorHighMemoryUsage
        expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires only when the accepted-span rate is exactly 0 while spans are still being refused.
      - alert: OpentelemetryCollectorOtlpReceiverErrors
        expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp"}[5m]) > 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterPatroni - name: EmbeddedExporterPatroni
rules: rules:
- alert: PatroniHasNoLeader - alert: PatroniHasNoLeader

View file

@ -2,6 +2,7 @@ groups:
- name: SpreakerPgbouncerExporter - name: SpreakerPgbouncerExporter
rules: rules:
- alert: PgbouncerActiveConnections - alert: PgbouncerActiveConnections

View file

@ -2,6 +2,7 @@ groups:
- name: BakinsFpmExporter - name: BakinsFpmExporter
rules: rules:
- alert: Php-fpmMax-childrenReached - alert: Php-fpmMax-childrenReached

View file

@ -2,6 +2,7 @@ groups:
- name: PostgresExporter - name: PostgresExporter
rules: rules:
- alert: PostgresqlDown - alert: PostgresqlDown
@ -166,6 +167,7 @@ groups:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatIndexHigh(>80%) - alert: PostgresqlBloatIndexHigh(>80%)
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)' expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
for: 1h for: 1h
@ -175,6 +177,7 @@ groups:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatTableHigh(>80%) - alert: PostgresqlBloatTableHigh(>80%)
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)' expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
for: 1h for: 1h
@ -184,6 +187,7 @@ groups:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlInvalidIndex - alert: PostgresqlInvalidIndex
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h for: 6h

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: PrometheusJobMissing - alert: PrometheusJobMissing

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: PromtailRequestErrors - alert: PromtailRequestErrors

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: PulsarSubscriptionHighNumberOfBacklogEntries - alert: PulsarSubscriptionHighNumberOfBacklogEntries

View file

@ -2,6 +2,7 @@ groups:
- name: KbuddeRabbitmqExporter - name: KbuddeRabbitmqExporter
rules: rules:
- alert: RabbitmqDown - alert: RabbitmqDown
@ -49,6 +50,7 @@ groups:
summary: RabbitMQ too many connections (instance {{ $labels.instance }}) summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqDeadLetterQueueFillingUp - alert: RabbitmqDeadLetterQueueFillingUp
expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10' expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
for: 1m for: 1m
@ -58,6 +60,7 @@ groups:
summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }}) summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqTooManyMessagesInQueue - alert: RabbitmqTooManyMessagesInQueue
expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000' expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
for: 2m for: 2m
@ -67,6 +70,7 @@ groups:
summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }}) summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqSlowQueueConsuming - alert: RabbitmqSlowQueueConsuming
expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60' expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
for: 2m for: 2m
@ -85,6 +89,7 @@ groups:
summary: RabbitMQ no consumer (instance {{ $labels.instance }}) summary: RabbitMQ no consumer (instance {{ $labels.instance }})
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqTooManyConsumers - alert: RabbitmqTooManyConsumers
expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1' expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
for: 0m for: 0m
@ -94,6 +99,7 @@ groups:
summary: RabbitMQ too many consumers (instance {{ $labels.instance }}) summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the exchange name in dedicated label.
- alert: RabbitmqUnactiveExchange - alert: RabbitmqUnactiveExchange
expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5' expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
for: 2m for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: RabbitmqExporter - name: RabbitmqExporter
rules: rules:
- alert: RabbitmqNodeDown - alert: RabbitmqNodeDown

View file

@ -2,6 +2,7 @@ groups:
- name: Oliver006RedisExporter - name: Oliver006RedisExporter
rules: rules:
- alert: RedisDown - alert: RedisDown
@ -67,6 +68,7 @@ groups:
summary: Redis missing backup (instance {{ $labels.instance }}) summary: Redis missing backup (instance {{ $labels.instance }})
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- alert: RedisOutOfSystemMemory - alert: RedisOutOfSystemMemory
expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90' expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90'
for: 2m for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: SmartctlExporter - name: SmartctlExporter
rules: rules:
- alert: SmartDeviceTemperatureWarning - alert: SmartDeviceTemperatureWarning

View file

@ -2,6 +2,7 @@ groups:
- name: StrechSidekiqExporter - name: StrechSidekiqExporter
rules: rules:
- alert: SidekiqQueueSize - alert: SidekiqQueueSize

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter - name: EmbeddedExporter
rules: rules:
- alert: SolrUpdateErrors - alert: SolrUpdateErrors

View file

@ -2,6 +2,7 @@ groups:
- name: NlamiraultSpeedtestExporter - name: NlamiraultSpeedtestExporter
rules: rules:
- alert: SpeedtestSlowInternetDownload - alert: SpeedtestSlowInternetDownload

View file

@ -2,6 +2,7 @@ groups:
- name: OzarklakeMssqlExporter - name: OzarklakeMssqlExporter
rules: rules:
- alert: SqlServerDown - alert: SqlServerDown

View file

@ -2,6 +2,7 @@ groups:
- name: RibbybibbySslExporter - name: RibbybibbySslExporter
rules: rules:
- alert: SslCertificateProbeFailed - alert: SslCertificateProbeFailed

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosBucketReplicate - name: ThanosBucketReplicate
rules: rules:
- alert: ThanosBucketReplicateErrorRate - alert: ThanosBucketReplicateErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosCompactor - name: ThanosCompactor
rules: rules:
- alert: ThanosCompactorMultipleRunning - alert: ThanosCompactorMultipleRunning

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosComponentAbsent - name: ThanosComponentAbsent
rules: rules:
- alert: ThanosCompactIsDown - alert: ThanosCompactIsDown

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosQuery - name: ThanosQuery
rules: rules:
- alert: ThanosQueryHttpRequestQueryErrorRateHigh - alert: ThanosQueryHttpRequestQueryErrorRateHigh

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosReceiver - name: ThanosReceiver
rules: rules:
- alert: ThanosReceiveHttpRequestErrorRateHigh - alert: ThanosReceiveHttpRequestErrorRateHigh

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosRuler - name: ThanosRuler
rules: rules:
- alert: ThanosRuleQueueIsDroppingAlerts - alert: ThanosRuleQueueIsDroppingAlerts

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosSidecar - name: ThanosSidecar
rules: rules:
- alert: ThanosSidecarBucketOperationsFailed - alert: ThanosSidecarBucketOperationsFailed

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosStore - name: ThanosStore
rules: rules:
- alert: ThanosStoreGrpcErrorRate - alert: ThanosStoreGrpcErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV1 - name: EmbeddedExporterV1
rules: rules:
- alert: TraefikBackendDown - alert: TraefikBackendDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV2 - name: EmbeddedExporterV2
rules: rules:
- alert: TraefikServiceDown - alert: TraefikServiceDown

View file

@ -2,6 +2,7 @@ groups:
- name: PryordaVmwareExporter - name: PryordaVmwareExporter
rules: rules:
- alert: VirtualMachineMemoryWarning - alert: VirtualMachineMemoryWarning

View file

@ -2,6 +2,7 @@ groups:
- name: WindowsExporter - name: WindowsExporter
rules: rules:
- alert: WindowsServerCollectorError - alert: WindowsServerCollectorError

View file

@ -2,6 +2,7 @@ groups:
- name: NodeExporter - name: NodeExporter
rules: rules:
- alert: ZfsOfflinePool - alert: ZfsOfflinePool

View file

@ -2,6 +2,7 @@ groups:
- name: Zfs_exporter - name: Zfs_exporter
rules: rules:
- alert: ZfsPoolOutOfSpace - alert: ZfsPoolOutOfSpace
@ -13,6 +14,13 @@ groups:
summary: ZFS pool out of space (instance {{ $labels.instance }}) summary: ZFS pool out of space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 0: ONLINE
# 1: DEGRADED
# 2: FAULTED
# 3: OFFLINE
# 4: UNAVAIL
# 5: REMOVED
# 6: SUSPENDED
- alert: ZfsPoolUnhealthy - alert: ZfsPoolUnhealthy
expr: 'zfs_pool_health > 0' expr: 'zfs_pool_health > 0'
for: 0m for: 0m

View file

@ -2,4 +2,5 @@ groups:
- name: CloudflareKafkaZookeeperExporter - name: CloudflareKafkaZookeeperExporter
rules: rules:

View file

@ -2,6 +2,7 @@ groups:
- name: DabealuZookeeperExporter - name: DabealuZookeeperExporter
rules: rules:
- alert: ZookeeperDown - alert: ZookeeperDown

6
dist/template.yml vendored
View file

@ -2,9 +2,13 @@ groups:
{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
- name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }} - name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }}
{% assign lines = comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
{% endfor %}
rules: rules:
{% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %} {% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
{% for comment in comments %}# {{ comment | strip }} {% assign lines = rule.comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
{% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }} {% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
expr: '{{ rule.query }}' expr: '{{ rule.query }}'
for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %} for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}

View file

@ -62,8 +62,9 @@
// @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋 // @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋
{% endhighlight %} {% endhighlight %}
{% else %} {% else %}
{{ exporter.comments | strip | newline_to_br }}
{% highlight bash %} {% highlight bash %}
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml $ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/refs/heads/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
{% endhighlight %} {% endhighlight %}
{% endif %} {% endif %}