add ECC alert (closes #19)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
6637456507
commit
30b7c96833
1 changed file with 16 additions and 1 deletion
|
@ -81,6 +81,20 @@ spec:
|
|||
annotations:
|
||||
summary: "SMARTcheck not running"
|
||||
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
# Warning-level rule: matches when node_edac_correctable_errors_total is
# above 100 for a series; the node label and the current value are
# interpolated into the annotations.
# NOTE(review): the threshold is applied to the raw counter value, so the
# alert presumably stays active until the counter resets — confirm that
# this is the intended behaviour.
- alert: 'ECC Memory errors'
  expr: (node_edac_correctable_errors_total) > 100
  labels:
    severity: warning
  annotations:
    summary: 'ECC errors on {{ $labels.node }}'
    description: >-
      The node {{ $labels.node }} accumulated {{ $value }} correctable errors.
|
||||
# Critical-level rule: matches as soon as node_edac_uncorrectable_errors_total
# is above zero for a series; the node label and the current value are
# interpolated into the annotations.
- alert: "ECC Memory uncorrectable errors"
  expr: (node_edac_uncorrectable_errors_total) > 0
  labels:
    severity: critical
  annotations:
    # The summary names the error class so this critical notification can be
    # told apart from the "ECC Memory errors" warning, whose summary would
    # otherwise read exactly the same.
    summary: "Uncorrectable ECC errors on {{ $labels.node }}"
    description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
|
||||
- name: etcdbackup
|
||||
rules:
|
||||
- alert: "etcdbackup too old"
|
||||
|
@ -107,3 +121,4 @@ spec:
|
|||
annotations:
|
||||
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
||||
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
||||
|
||||
|
|
Loading…
Reference in a new issue