Alerting: edit rules for low storage

This commit is contained in:
Aaron Riedel 2023-06-24 09:56:07 +02:00
parent 78793ed440
commit 812cd1efa6
Signed by: aaron
GPG key ID: 643004654D40D577

View file

@ -19,21 +19,21 @@ spec:
summary: "Memory over 90%"
description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
- alert: DiskspaceLowWorker
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 25
for: 1m
labels:
severity: warning
annotations:
summary: "Free disk space below 10 GB"
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
summary: "Free disk space below 25 GB"
description: "Disk space on node {{ $labels.node }} is under 25 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- alert: DiskspaceLowMaster
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 2
for: 1m
labels:
severity: warning
annotations:
summary: "Free disk space below 2 GB"
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
description: "Disk space on node {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
@ -80,15 +80,15 @@ spec:
labels:
severity: critical
annotations:
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: "SMARTcheck too old"
expr: (time() - smartmon_smartctl_run) > 10800
labels:
severity: warning
annotations:
summary: "SMARTcheck not running"
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
- name: etcdbackup
rules:
- alert: "etcdbackup too old"