diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml index a0c9a2b..a6238e9 100644 --- a/prometheus/alerts.yaml +++ b/prometheus/alerts.yaml @@ -19,21 +19,21 @@ spec: summary: "Memory over 90%" description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%" - alert: DiskspaceLowWorker - expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50 + expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 25 for: 1m labels: severity: warning annotations: - summary: "Free disk space below 10 GB" - description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" + summary: "Free disk space below 25 GB" + description: "Disk space on node {{ $labels.node }} is under 25 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - alert: DiskspaceLowMaster - expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10 + expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 2 for: 1m labels: severity: warning annotations: summary: "Free disk space below 2 GB" - description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" + description: "Disk space on node {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - alert: HostMemoryUnderMemoryPressure expr: rate(node_vmstat_pgmajfault[1m]) > 1000 for: 2m @@ -80,15 +80,15 @@ spec: labels: severity: critical annotations: - summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }} - description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }} + description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: "SMARTcheck too old" expr: (time() - smartmon_smartctl_run) > 10800 labels: severity: warning annotations: summary: "SMARTcheck not running" - description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.' + description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.' - name: etcdbackup rules: - alert: "etcdbackup too old"