Alerting: lower the low-disk-space alert thresholds and refer to hosts as "node" instead of "server"

This commit is contained in:
Aaron Riedel 2023-06-24 09:56:07 +02:00
parent 78793ed440
commit 812cd1efa6
Signed by: aaron
GPG key ID: 643004654D40D577

View file

@ -19,21 +19,21 @@ spec:
summary: "Memory over 90%" summary: "Memory over 90%"
description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%" description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
- alert: DiskspaceLowWorker - alert: DiskspaceLowWorker
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50 expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 25
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Free disk space below 10 GB" summary: "Free disk space below 25 GB"
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" description: "Disk space on node {{ $labels.node }} is under 25 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- alert: DiskspaceLowMaster - alert: DiskspaceLowMaster
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10 expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 2
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "Free disk space below 2 GB" summary: "Free disk space below 2 GB"
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" description: "Disk space on node {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- alert: HostMemoryUnderMemoryPressure - alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000 expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m for: 2m
@ -80,15 +80,15 @@ spec:
labels: labels:
severity: critical severity: critical
annotations: annotations:
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }} summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: "SMARTcheck too old" - alert: "SMARTcheck too old"
expr: (time() - smartmon_smartctl_run) > 10800 expr: (time() - smartmon_smartctl_run) > 10800
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: "SMARTcheck not running" summary: "SMARTcheck not running"
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.' description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
- name: etcdbackup - name: etcdbackup
rules: rules:
- alert: "etcdbackup too old" - alert: "etcdbackup too old"