add ECC alert (closes #19)
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
Aaron Riedel 2024-01-29 19:28:16 +01:00
parent 6637456507
commit 30b7c96833
Signed by: aaron
GPG key ID: 643004654D40D577

View file

@ -81,6 +81,20 @@ spec:
annotations:
summary: "SMARTcheck not running"
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
# Warning-level rule: fires when node_edac_correctable_errors_total is above 100.
# NOTE(review): the threshold is compared against the raw metric value, not a
# rate or increase over a window - confirm that this is the intended behavior.
- alert: 'ECC Memory errors'
  expr: '(node_edac_correctable_errors_total) > 100'
  labels:
    severity: 'warning'
  annotations:
    # Both annotations read the `node` label from the alerting series.
    summary: 'ECC errors on {{ $labels.node }}'
    # {{ $value }} is the value of the expression at evaluation time.
    description: >-
      The node {{ $labels.node }} accumulated {{ $value }} correctable errors.
# Critical-level rule: fires as soon as node_edac_uncorrectable_errors_total
# is above 0, i.e. on the first uncorrectable error reported.
- alert: "ECC Memory uncorrectable errors"
  expr: (node_edac_uncorrectable_errors_total) > 0
  labels:
    severity: critical
  annotations:
    # The summary names the error class explicitly so this notification is not
    # confused with the warning-level correctable-errors alert.
    summary: "Uncorrectable ECC errors on {{ $labels.node }}"
    description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
- name: etcdbackup
rules:
- alert: "etcdbackup too old"
@ -106,4 +120,5 @@ spec:
severity: warning
annotations:
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."