Add ECC memory alerts to prometheus/alerts.yaml and terminate the file with
exactly one newline.

Two rules are added after the SMARTcheck rule:
  * "ECC Memory errors" (severity: warning) fires while
    node_edac_correctable_errors_total is above 100.
  * "ECC Memory uncorrectable errors" (severity: critical) fires while
    node_edac_uncorrectable_errors_total is above 0.
The critical rule carries its own summary text so that the two alerts can be
told apart in a notification.

NOTE(review): the YAML indentation below was reconstructed from a
whitespace-collapsed copy of this patch -- confirm it matches the target file
before applying. The post-image blob id on the "index" line predates the edits
described above; regenerate the patch with `git diff` to refresh it.

diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
index 64e6f19..17f3d6b 100644
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -81,6 +81,22 @@ spec:
           annotations:
             summary: "SMARTcheck not running"
             description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
+        # Warning: the correctable ECC error counter of a node is above 100.
+        - alert: "ECC Memory errors"
+          expr: (node_edac_correctable_errors_total) > 100
+          labels:
+            severity: warning
+          annotations:
+            summary: "ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
+        # Critical: a node reports at least one uncorrectable ECC error.
+        - alert: "ECC Memory uncorrectable errors"
+          expr: (node_edac_uncorrectable_errors_total) > 0
+          labels:
+            severity: critical
+          annotations:
+            summary: "Uncorrectable ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
     - name: etcdbackup
       rules:
         - alert: "etcdbackup too old"
@@ -106,4 +122,4 @@ spec:
             severity: warning
           annotations:
             summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
-            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
\ No newline at end of file
+            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."