From fd6cc7ef3d396e0c4bd1659c519f236e77004bda Mon Sep 17 00:00:00 2001
From: Aaron Riedel
Date: Thu, 22 Jun 2023 19:59:06 +0200
Subject: [PATCH] add etcdbackup alerts

---
Reviewer notes (free-form text in this position is not part of the commit):

  Adds a rule group "etcdbackup" with two warning-severity alerts.

  "etcdbackup too old" fires when time() - etcdbackup_time exceeds
  10800 seconds, i.e. the 3h stated in its description. This presumably
  expects etcdbackup_time to be a Unix timestamp in seconds; confirm
  against whatever exports the metric.

  "etcdbackup failed" fires whenever etcdbackup_result is greater than 0.

  Neither rule sets a "for:" duration, and neither covers the case where
  the metrics are missing altogether; consider a follow-up if that matters.

 prometheus/alerts.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
index 947f5af..a0c9a2b 100644
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -89,6 +89,22 @@ spec:
           annotations:
             summary: "SMARTcheck not running"
             description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
+    - name: etcdbackup
+      rules:
+        - alert: "etcdbackup too old"
+          expr: (time() - etcdbackup_time) > 10800
+          labels:
+            severity: warning
+          annotations:
+            summary: "etcd backup not running"
+            description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
+        - alert: "etcdbackup failed"
+          expr: etcdbackup_result > 0
+          labels:
+            severity: warning
+          annotations:
+            summary: "etcdbackup failed"
+            description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
     - name: kubernetes
       rules:
         - alert: KubernetesUnhealthyPod