From 30b7c968339472da8fb22e74961cb698bc24af2f Mon Sep 17 00:00:00 2001
From: Aaron Riedel
Date: Mon, 29 Jan 2024 19:28:16 +0100
Subject: [PATCH] add ECC alert (closes #19)

---
Review notes (this section is not part of the commit message):
* Adds two alert rules on the node_edac_* error counters: a warning once
  the correctable counter exceeds 100, and a critical alert on the first
  uncorrectable error.
* NOTE(review): the leading whitespace of the YAML lines below was
  reconstructed (2-space steps, indented sequences) - TODO confirm it
  matches prometheus/alerts.yaml before applying.
* NOTE(review): the blob ids on the "index" line were not recomputed
  after the review changes - do not rely on them for a 3-way merge.

 prometheus/alerts.yaml | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
index 64e6f19..17f3d6b 100644
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -81,6 +81,22 @@ spec:
           annotations:
             summary: "SMARTcheck not running"
             description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
+        # Warns once the correctable ECC error counter exceeds 100.
+        - alert: "ECC Memory errors"
+          expr: (node_edac_correctable_errors_total) > 100
+          labels:
+            severity: warning
+          annotations:
+            summary: "ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
+        # Critical as soon as any uncorrectable ECC error has been counted.
+        - alert: "ECC Memory uncorrectable errors"
+          expr: (node_edac_uncorrectable_errors_total) > 0
+          labels:
+            severity: critical
+          annotations:
+            summary: "Uncorrectable ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
     - name: etcdbackup
       rules:
         - alert: "etcdbackup too old"
@@ -106,4 +122,4 @@ spec:
             severity: warning
           annotations:
             summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
-            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
\ No newline at end of file
+            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."