Alerting: edit rules for storage low
This commit is contained in:
parent
78793ed440
commit
812cd1efa6
1 changed file with 8 additions and 8 deletions
|
@@ -19,21 +19,21 @@ spec:
|
||||||
summary: "Memory over 90%"
|
summary: "Memory over 90%"
|
||||||
description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
|
description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
|
||||||
- alert: DiskspaceLowWorker
|
- alert: DiskspaceLowWorker
|
||||||
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
|
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 25
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Free disk space below 10 GB"
|
summary: "Free disk space below 25 GB"
|
||||||
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
description: "Disk space on node {{ $labels.node }} is under 25 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
||||||
- alert: DiskspaceLowMaster
|
- alert: DiskspaceLowMaster
|
||||||
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
|
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 2
|
||||||
for: 1m
|
for: 1m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "Free disk space below 2 GB"
|
summary: "Free disk space below 2 GB"
|
||||||
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
description: "Disk space on node {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
||||||
- alert: HostMemoryUnderMemoryPressure
|
- alert: HostMemoryUnderMemoryPressure
|
||||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||||
for: 2m
|
for: 2m
|
||||||
|
@@ -80,15 +80,15 @@ spec:
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
annotations:
|
annotations:
|
||||||
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
|
summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
|
||||||
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
- alert: "SMARTcheck too old"
|
- alert: "SMARTcheck too old"
|
||||||
expr: (time() - smartmon_smartctl_run) > 10800
|
expr: (time() - smartmon_smartctl_run) > 10800
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
annotations:
|
annotations:
|
||||||
summary: "SMARTcheck not running"
|
summary: "SMARTcheck not running"
|
||||||
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
|
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||||
- name: etcdbackup
|
- name: etcdbackup
|
||||||
rules:
|
rules:
|
||||||
- alert: "etcdbackup too old"
|
- alert: "etcdbackup too old"
|
||||||
|
|
Loading…
Reference in a new issue