add ECC alert (closes #19)
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
6637456507
commit
30b7c96833
1 changed file with 16 additions and 1 deletion
|
@@ -81,6 +81,20 @@ spec:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "SMARTcheck not running"
|
summary: "SMARTcheck not running"
|
||||||
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||||
|
- alert: "ECC Memory errors"
|
||||||
|
expr: (node_edac_correctable_errors_total) > 100
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "ECC errors on {{ $labels.node }}"
|
||||||
|
description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
|
||||||
|
- alert: "ECC Memory uncorrectable errors"
|
||||||
|
expr: (node_edac_uncorrectable_errors_total) > 0
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "ECC errors on {{ $labels.node }}"
|
||||||
|
description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
|
||||||
- name: etcdbackup
|
- name: etcdbackup
|
||||||
rules:
|
rules:
|
||||||
- alert: "etcdbackup too old"
|
- alert: "etcdbackup too old"
|
||||||
|
@@ -107,3 +121,4 @@ spec:
|
||||||
annotations:
|
annotations:
|
||||||
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
||||||
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue