Merge pull request #3 from samber/master

Merge latest from upstream
This commit is contained in:
Simon Matic Langford 2025-11-17 10:20:20 +00:00 committed by GitHub
commit e2ffbfb653
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
84 changed files with 320 additions and 3 deletions

View file

@ -18,7 +18,7 @@ Please ensure your pull request adheres to the following guidelines:
## Improving Github page
### Run localy
### Run locally
```
gem install bundler

View file

@ -106,6 +106,7 @@ Collection available here: **[https://samber.github.io/awesome-prometheus-alerts
- [Promtail](https://samber.github.io/awesome-prometheus-alerts/rules#promtail)
- [Cortex](https://samber.github.io/awesome-prometheus-alerts/rules#cortex)
- [Grafana Alloy](https://samber.github.io/awesome-prometheus-alerts/rules#grafana-alloy)
- [OpenTelemetry Collector](https://samber.github.io/awesome-prometheus-alerts/rules#opentelemetry-collector)
- [Jenkins](https://samber.github.io/awesome-prometheus-alerts/rules#jenkins)
- [Graph Node](https://samber.github.io/awesome-prometheus-alerts/rules#graph-node)

View file

@ -3183,6 +3183,76 @@ groups:
query: "count by (instance) (alloy_build_info) unless count by (instance) (alloy_build_info offset 2m) "
severity: critical
- name: OpenTelemetry Collector
  exporters:
    - name: Embedded exporter
      slug: embedded-exporter
      doc_url: https://opentelemetry.io/docs/collector/internal-telemetry/
      comments: |
        OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
        These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
        All collector internal metrics are prefixed with 'otelcol_'.
      rules:
        # Scrape-level availability: fires when a target whose job name
        # matches the otel/collector pattern reports up == 0.
        - name: OpenTelemetry Collector down
          description: OpenTelemetry Collector instance has disappeared or is not being scraped
          query: 'up{job=~".*otel.*collector.*"} == 0'
          severity: critical
          for: 1m

        # Receiver side: any sustained non-zero refusal rate, per signal type.
        - name: OpenTelemetry Collector receiver refused spans
          description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused metric points
          description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
          severity: critical
          for: 5m
        - name: OpenTelemetry Collector receiver refused log records
          description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}"
          query: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
          severity: critical
          for: 5m

        # Exporter side: any sustained non-zero send-failure rate, per signal type.
        - name: OpenTelemetry Collector exporter failed spans
          description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed metric points
          description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector exporter failed log records
          description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}"
          query: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
          severity: warning
          for: 5m

        # The division keeps only the labels listed in on(...), so the
        # capacity guard has to match on the same label set. Without it the
        # `and` compares full label sets and can never pair with a capacity
        # series that carries any additional label, silencing the alert.
        - name: OpenTelemetry Collector exporter queue nearly full
          description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full"
          query: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
          severity: warning

        # Processor side: sustained refusals, per signal type.
        - name: OpenTelemetry Collector processor refused spans
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_spans[5m]) > 0'
          severity: warning
          for: 5m
        - name: OpenTelemetry Collector processor refused metric points
          description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure"
          query: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
          severity: warning
          for: 5m

        # Ratio of allocated heap to otelcol_process_runtime_total_sys_memory_bytes.
        # NOTE(review): the denominator looks like memory obtained by the Go
        # runtime rather than host or container memory - confirm that a 90%
        # threshold on this ratio is the intended signal.
        - name: OpenTelemetry Collector high memory usage
          description: "OpenTelemetry Collector memory usage is above 90%"
          query: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
          severity: warning
          for: 5m

        # Prometheus regex matchers are fully anchored, so a bare "otlp"
        # pattern only matches a receiver named exactly "otlp". The optional
        # "/name" suffix also covers named instances such as "otlp/internal".
        - name: OpenTelemetry Collector OTLP receiver errors
          description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused"
          query: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp(/.*)?"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp(/.*)?"}[5m]) > 0'
          severity: critical
          for: 2m
- name: Jenkins
exporters:
- name: Metric plugin

View file

@ -2,6 +2,7 @@ groups:
- name: LusitaniaeApacheExporter
rules:
- alert: ApacheDown

View file

@ -2,6 +2,7 @@ groups:
- name: Apcupsd_exporter
rules:
- alert: ApcUpsBatteryNearlyEmpty

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: ArgocdServiceNotSynced

View file

@ -2,6 +2,7 @@ groups:
- name: BlackboxExporter
rules:
- alert: BlackboxProbeFailed
@ -58,6 +59,10 @@ groups:
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# For probe_ssl_earliest_cert_expiry to be exposed after expiration, you
# need to enable insecure_skip_verify. Note that this will disable
# certificate validation.
# See https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md#tls_config
- alert: BlackboxSslCertificateExpired
expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CaddyReverseProxyDown

View file

@ -2,6 +2,7 @@ groups:
- name: CriteoCassandraExporter
rules:
- alert: CassandraHintsCount

View file

@ -2,6 +2,7 @@ groups:
- name: InstaclustrCassandraExporter
rules:
- alert: CassandraNodeIsUnavailable

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CephState

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: ClickhouseNodeDown
@ -85,6 +86,7 @@ groups:
summary: ClickHouse No Live Replicas (instance {{ $labels.instance }})
description: "There are too few live replicas available, risking data loss and service disruption.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkTraffic
expr: 'ClickHouseMetrics_NetworkSend > 250 or ClickHouseMetrics_NetworkReceive > 250'
for: 5m
@ -94,6 +96,7 @@ groups:
summary: ClickHouse High Network Traffic (instance {{ $labels.instance }})
description: "Network traffic is unusually high, may affect cluster performance.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighTcpConnections
expr: 'ClickHouseMetrics_TCPConnection > 400'
for: 5m
@ -166,6 +169,7 @@ groups:
summary: ClickHouse zookeeper hardware exception (instance {{ $labels.instance }})
description: "Zookeeper hardware exception: network issues communicating with ZooKeeper\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please replace the threshold with an appropriate value
- alert: ClickhouseHighNetworkUsage
expr: 'rate(ClickHouseProfileEvents_NetworkSendBytes[1m]) > 100*1024*1024 or rate(ClickHouseProfileEvents_NetworkReceiveBytes[1m]) > 100*1024*1024'
for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: LablabsCloudflareExporter
rules:
- alert: CloudflareHttp4xxErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: ConsulExporter
rules:
- alert: ConsulServiceHealthcheckFailed

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CorednsPanicCount

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: CortexRulerConfigurationReloadFailure

View file

@ -2,6 +2,7 @@ groups:
- name: GesellixCouchdbPrometheusExporter
rules:
- alert: CouchdbNodeDown

View file

@ -2,8 +2,10 @@ groups:
- name: GoogleCadvisor
rules:
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerKilled
expr: 'time() - container_last_seen > 60'
for: 0m
@ -13,6 +15,7 @@ groups:
summary: Container killed (instance {{ $labels.instance }})
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
- alert: ContainerAbsent
expr: 'absent(container_last_seen)'
for: 5m
@ -31,6 +34,7 @@ groups:
summary: Container High CPU utilization (instance {{ $labels.instance }})
description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
- alert: ContainerHighMemoryUsage
expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: PrometheusCommunityElasticsearchExporter
rules:
- alert: ElasticsearchHeapUsageTooHigh

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: EtcdInsufficientMembers

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: FluxKustomizationFailure

View file

@ -2,6 +2,7 @@ groups:
- name: ZnerolFreeswitchExporter
rules:
- alert: FreeswitchDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: GrafanaAlloyServiceDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: ProviderFailedBecauseNet_versionFailed

View file

@ -2,6 +2,7 @@ groups:
- name: Jmx_exporter
rules:
- alert: HadoopNameNodeDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV2
rules:
- alert: HaproxyHighHttp4xxErrorRateBackend

View file

@ -2,6 +2,7 @@ groups:
- name: HaproxyExporterV1
rules:
- alert: HaproxyDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: VaultSealed

View file

@ -2,6 +2,7 @@ groups:
- name: NodeExporter
rules:
- alert: HostOutOfMemory
@ -22,6 +23,7 @@ groups:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of loading memory pages from disk.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostMemoryIsUnderutilized
expr: 'min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * .8'
for: 0m
@ -58,6 +60,9 @@ groups:
summary: Host unusual disk read rate (instance {{ $labels.instance }})
description: "Disk is too busy (IO wait > 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
expr: '(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < .10 and on (instance, device, mountpoint) node_filesystem_readonly == 0)'
for: 2m
@ -67,6 +72,9 @@ groups:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskMayFillIn24Hours
expr: 'predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0'
for: 2m
@ -130,6 +138,7 @@ groups:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# You may want to increase the alert manager 'repeat_interval' for this type of alert to daily or weekly
- alert: HostCpuIsUnderutilized
expr: '(min by (instance) (rate(node_cpu_seconds_total{mode="idle"}[1h]))) > 0.8'
for: 1w
@ -166,6 +175,9 @@ groups:
summary: Host unusual disk IO (instance {{ $labels.instance }})
description: "Disk usage >80%. Check storage for issues or increase IOPS capabilities. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# x2 context switches is an arbitrary number.
# The alert threshold depends on the nature of the application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitchingHigh
expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) / (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2'
for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: IstioKubernetesGatewayAvailabilityDrop

View file

@ -2,6 +2,7 @@ groups:
- name: MetricPlugin
rules:
- alert: JenkinsOffline
@ -58,6 +59,12 @@ groups:
summary: Jenkins build tests failing (instance {{ $labels.instance }})
description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# * RUNNING -1 true - The build had no errors.
# * SUCCESS 0 true - The build had no errors.
# * UNSTABLE 1 true - The build had some errors but they were not fatal. For example, some tests failed.
# * FAILURE 2 false - The build had a fatal error.
# * NOT_BUILT 3 false - The module was not built.
# * ABORTED 4 false - The build was manually aborted.
- alert: JenkinsLastBuildFailed
expr: 'default_jenkins_builds_last_build_result_ordinal == 2'
for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: CzerwonkJunosExporter
rules:
- alert: JuniperSwitchDown

View file

@ -2,6 +2,7 @@ groups:
- name: JvmExporter
rules:
- alert: JvmMemoryFillingUp

View file

@ -2,6 +2,7 @@ groups:
- name: DanielqsjKafkaExporter
rules:
- alert: KafkaTopicsReplicas

View file

@ -2,6 +2,7 @@ groups:
- name: LinkedinKafkaExporter
rules:
- alert: KafkaTopicOffsetDecreased

View file

@ -2,6 +2,7 @@ groups:
- name: KubestateExporter
rules:
- alert: KubernetesNodeNotReady
@ -13,6 +14,8 @@ groups:
summary: Kubernetes Node ready (node {{ $labels.node }})
description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Kubernetes nodes with scheduling disabled are fine.
# This alert is useful for being warned about nodes that have stayed unschedulable for a long time.
- alert: KubernetesNodeSchedulingDisabled
expr: 'kube_node_spec_taint{key="node.kubernetes.io/unschedulable"} == 1'
for: 30m
@ -265,6 +268,7 @@ groups:
summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Threshold should be customized for each cronjob name.
- alert: KubernetesCronjobTooLong
expr: 'kube_job_status_start_time > 0 and absent(kube_job_status_completion_time) and (time() - kube_job_status_start_time) > 3600'
for: 0m

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: LinkerdHighErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: LokiProcessTooManyRestarts

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: MeilisearchIndexIsEmpty

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: MinioClusterDiskOffline

View file

@ -2,6 +2,7 @@ groups:
- name: DcuMongodbExporter
rules:
- alert: MongodbReplicationLag

View file

@ -2,6 +2,7 @@ groups:
- name: PerconaMongodbExporter
rules:
- alert: MongodbDown

View file

@ -2,6 +2,7 @@ groups:
- name: StefanprodanMgobExporter
rules:
- alert: MgobBackupFailed

View file

@ -2,6 +2,7 @@ groups:
- name: MysqldExporter
rules:
- alert: MysqlDown

View file

@ -2,6 +2,7 @@ groups:
- name: NatsExporter
rules:
- alert: NatsHighConnectionCount

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: NetdataHighCpuUsage

View file

@ -2,6 +2,7 @@ groups:
- name: KnyarNginxExporter
rules:
- alert: NginxHighHttp4xxErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: NomadJobFailed

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: OpenebsUsedPoolCapacity

View file

@ -0,0 +1,117 @@
groups:

- name: EmbeddedExporter

  # OpenTelemetry Collector self-monitoring metrics are exposed on port 8888 by default at the /metrics endpoint.
  # These alerts monitor the collector's health when metrics are ingested via the Prometheus OTLP endpoint or scraped directly.
  # All collector internal metrics are prefixed with 'otelcol_'.

  rules:

    - alert: OpentelemetryCollectorDown
      expr: 'up{job=~".*otel.*collector.*"} == 0'
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector down (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector instance has disappeared or is not being scraped\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedSpans
      expr: 'rate(otelcol_receiver_refused_spans[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing spans on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedMetricPoints
      expr: 'rate(otelcol_receiver_refused_metric_points[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing metric points on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorReceiverRefusedLogRecords
      expr: 'rate(otelcol_receiver_refused_log_records[5m]) > 0'
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector receiver refused log records (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector is refusing log records on {{ $labels.receiver }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorExporterFailedSpans
      expr: 'rate(otelcol_exporter_send_failed_spans[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send spans via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorExporterFailedMetricPoints
      expr: 'rate(otelcol_exporter_send_failed_metric_points[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send metric points via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorExporterFailedLogRecords
      expr: 'rate(otelcol_exporter_send_failed_log_records[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter failed log records (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector failing to send log records via {{ $labels.exporter }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # The division keeps only the labels listed in on(...), so the capacity
    # guard must match on the same label set; otherwise the `and` compares
    # full label sets and cannot pair with a capacity series that carries
    # any additional label.
    - alert: OpentelemetryCollectorExporterQueueNearlyFull
      expr: '(otelcol_exporter_queue_size / on(instance, job, exporter) otelcol_exporter_queue_capacity) > 0.8 and on(instance, job, exporter) otelcol_exporter_queue_capacity > 0'
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector exporter queue nearly full (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector exporter {{ $labels.exporter }} queue is over 80% full\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorProcessorRefusedSpans
      expr: 'rate(otelcol_processor_refused_spans[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused spans (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing spans, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorProcessorRefusedMetricPoints
      expr: 'rate(otelcol_processor_refused_metric_points[5m]) > 0'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector processor refused metric points (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector processor {{ $labels.processor }} is refusing metric points, likely due to backpressure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    - alert: OpentelemetryCollectorHighMemoryUsage
      expr: '(otelcol_process_runtime_heap_alloc_bytes{job=~".*otel.*collector.*"} / on(instance, job) otelcol_process_runtime_total_sys_memory_bytes{job=~".*otel.*collector.*"}) > 0.9'
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: OpenTelemetry Collector high memory usage (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector memory usage is above 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

    # Prometheus regex matchers are fully anchored, so a bare "otlp" pattern
    # only matches a receiver named exactly "otlp". The optional "/name"
    # suffix also covers named instances such as "otlp/internal".
    - alert: OpentelemetryCollectorOtlpReceiverErrors
      expr: 'rate(otelcol_receiver_accepted_spans{receiver=~"otlp(/.*)?"}[5m]) == 0 and rate(otelcol_receiver_refused_spans{receiver=~"otlp(/.*)?"}[5m]) > 0'
      for: 2m
      labels:
        severity: critical
      annotations:
        summary: OpenTelemetry Collector OTLP receiver errors (instance {{ $labels.instance }})
        description: "OpenTelemetry Collector OTLP receiver is completely failing - all spans are being refused\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterPatroni
rules:
- alert: PatroniHasNoLeader

View file

@ -2,6 +2,7 @@ groups:
- name: SpreakerPgbouncerExporter
rules:
- alert: PgbouncerActiveConnections

View file

@ -2,6 +2,7 @@ groups:
- name: BakinsFpmExporter
rules:
- alert: Php-fpmMax-childrenReached

View file

@ -2,6 +2,7 @@ groups:
- name: PostgresExporter
rules:
- alert: PostgresqlDown
@ -166,6 +167,7 @@ groups:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatIndexHigh(>80%)
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
for: 1h
@ -175,6 +177,7 @@ groups:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlBloatTableHigh(>80%)
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
for: 1h
@ -184,6 +187,7 @@ groups:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# See https://github.com/samber/awesome-prometheus-alerts/issues/289#issuecomment-1164842737
- alert: PostgresqlInvalidIndex
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: PrometheusJobMissing

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: PromtailRequestErrors

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: PulsarSubscriptionHighNumberOfBacklogEntries

View file

@ -2,6 +2,7 @@ groups:
- name: KbuddeRabbitmqExporter
rules:
- alert: RabbitmqDown
@ -49,6 +50,7 @@ groups:
summary: RabbitMQ too many connections (instance {{ $labels.instance }})
description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqDeadLetterQueueFillingUp
expr: 'rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10'
for: 1m
@ -58,6 +60,7 @@ groups:
summary: RabbitMQ dead letter queue filling up (instance {{ $labels.instance }})
description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqTooManyMessagesInQueue
expr: 'rabbitmq_queue_messages_ready{queue="my-queue"} > 1000'
for: 2m
@ -67,6 +70,7 @@ groups:
summary: RabbitMQ too many messages in queue (instance {{ $labels.instance }})
description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqSlowQueueConsuming
expr: 'time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60'
for: 2m
@ -85,6 +89,7 @@ groups:
summary: RabbitMQ no consumer (instance {{ $labels.instance }})
description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the queue name in dedicated label.
- alert: RabbitmqTooManyConsumers
expr: 'rabbitmq_queue_consumers{queue="my-queue"} > 1'
for: 0m
@ -94,6 +99,7 @@ groups:
summary: RabbitMQ too many consumers (instance {{ $labels.instance }})
description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Indicate the exchange name in dedicated label.
- alert: RabbitmqUnactiveExchange
expr: 'rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5'
for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: RabbitmqExporter
rules:
- alert: RabbitmqNodeDown

View file

@ -2,6 +2,7 @@ groups:
- name: Oliver006RedisExporter
rules:
- alert: RedisDown
@ -67,6 +68,7 @@ groups:
summary: Redis missing backup (instance {{ $labels.instance }})
description: "Redis has not been backuped for 24 hours\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# The exporter must be started with --include-system-metrics flag or REDIS_EXPORTER_INCL_SYSTEM_METRICS=true environment variable.
- alert: RedisOutOfSystemMemory
expr: 'redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90'
for: 2m

View file

@ -2,6 +2,7 @@ groups:
- name: SmartctlExporter
rules:
- alert: SmartDeviceTemperatureWarning

View file

@ -2,6 +2,7 @@ groups:
- name: StrechSidekiqExporter
rules:
- alert: SidekiqQueueSize

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporter
rules:
- alert: SolrUpdateErrors

View file

@ -2,6 +2,7 @@ groups:
- name: NlamiraultSpeedtestExporter
rules:
- alert: SpeedtestSlowInternetDownload

View file

@ -2,6 +2,7 @@ groups:
- name: OzarklakeMssqlExporter
rules:
- alert: SqlServerDown

View file

@ -2,6 +2,7 @@ groups:
- name: RibbybibbySslExporter
rules:
- alert: SslCertificateProbeFailed

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosBucketReplicate
rules:
- alert: ThanosBucketReplicateErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosCompactor
rules:
- alert: ThanosCompactorMultipleRunning

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosComponentAbsent
rules:
- alert: ThanosCompactIsDown

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosQuery
rules:
- alert: ThanosQueryHttpRequestQueryErrorRateHigh

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosReceiver
rules:
- alert: ThanosReceiveHttpRequestErrorRateHigh

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosRuler
rules:
- alert: ThanosRuleQueueIsDroppingAlerts

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosSidecar
rules:
- alert: ThanosSidecarBucketOperationsFailed

View file

@ -2,6 +2,7 @@ groups:
- name: ThanosStore
rules:
- alert: ThanosStoreGrpcErrorRate

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV1
rules:
- alert: TraefikBackendDown

View file

@ -2,6 +2,7 @@ groups:
- name: EmbeddedExporterV2
rules:
- alert: TraefikServiceDown

View file

@ -2,6 +2,7 @@ groups:
- name: PryordaVmwareExporter
rules:
- alert: VirtualMachineMemoryWarning

View file

@ -2,6 +2,7 @@ groups:
- name: WindowsExporter
rules:
- alert: WindowsServerCollectorError

View file

@ -2,6 +2,7 @@ groups:
- name: NodeExporter
rules:
- alert: ZfsOfflinePool

View file

@ -2,6 +2,7 @@ groups:
- name: Zfs_exporter
rules:
- alert: ZfsPoolOutOfSpace
@ -13,6 +14,13 @@ groups:
summary: ZFS pool out of space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 0: ONLINE
# 1: DEGRADED
# 2: FAULTED
# 3: OFFLINE
# 4: UNAVAIL
# 5: REMOVED
# 6: SUSPENDED
- alert: ZfsPoolUnhealthy
expr: 'zfs_pool_health > 0'
for: 0m

View file

@ -2,4 +2,5 @@ groups:
- name: CloudflareKafkaZookeeperExporter
rules:

View file

@ -2,6 +2,7 @@ groups:
- name: DabealuZookeeperExporter
rules:
- alert: ZookeeperDown

6
dist/template.yml vendored
View file

@ -2,9 +2,13 @@ groups:
{% assign groupName = slug | split: '-' %}{% capture groupNameCamelcase %}{% for word in groupName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
- name: {{ groupNameCamelcase | remove: ' ' | remove: '-' }}
{% assign lines = comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
{% endfor %}
rules:
{% for rule in rules %}{% assign ruleName = rule.name | split: ' ' %}{% capture ruleNameCamelcase %}{% for word in ruleName %}{{ word | capitalize }} {% endfor %}{% endcapture %}
{% for comment in comments %}# {{ comment | strip }}
{% assign lines = rule.comments | split: "
" %}{% for line in lines %}# {{ line | strip }}
{% endfor %}- alert: {{ ruleNameCamelcase | remove: ' ' }}
expr: '{{ rule.query }}'
for: {% if rule.for %}{{ rule.for }}{% else %}0m{% endif %}

View file

@ -62,8 +62,9 @@
// @TODO: Please contribute => https://github.com/samber/awesome-prometheus-alerts 👋
{% endhighlight %}
{% else %}
{{ exporter.comments | strip | newline_to_br }}
{% highlight bash %}
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
$ wget https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/refs/heads/master/dist/rules/{{ service.name | replace: " ", "-" | downcase }}/{{ exporter.slug }}.yml
{% endhighlight %}
{% endif %}