2023-06-20 08:43:48 +02:00
---
# PrometheusRule custom resource (API group monitoring.coreos.com/v1)
# named "prometheus-core-deployment-rules" in the "prometheus" namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-core-deployment-rules
  namespace: prometheus
2023-06-20 13:15:28 +02:00
# Labels attached to this PrometheusRule object.
# NOTE(review): presumably the label a Prometheus ruleSelector matches on —
# confirm against the Prometheus resource.
labels:
  monitor: core-deployment
2023-06-20 08:43:48 +02:00
# The rule groups follow; each group has a name and a list of alerting rules.
spec:
  groups:
2023-06-20 13:15:28 +02:00
# Rule group "hardware": memory, disk, CPU, temperature, SMART and ECC alerts.
- name: hardware
2023-06-20 08:43:48 +02:00
# Alerting rules belonging to the group named above.
rules:
2024-10-07 09:19:39 +02:00
# MemoryHigh: used memory (MemTotal - MemAvailable) as a percentage of
# MemTotal, rounded to the nearest 0.1, has been above 80 for 5 minutes.
- alert: MemoryHigh
  expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Memory over 80%"
    description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
# DiskspaceLow: available bytes on the filesystem mounted at "/" as a
# percentage of its size, rounded to the nearest integer, has been below 5
# for 1 minute.
- alert: DiskspaceLow
  expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Free disk space at {{ $value }}%"
    description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"
# HostMemoryUnderMemoryPressure: the per-second rate of major page faults,
# computed over a 1-minute window, has exceeded 1000 for 2 minutes.
- alert: HostMemoryUnderMemoryPressure
  expr: rate(node_vmstat_pgmajfault[1m]) > 1000
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Host memory under memory pressure {{ $labels.node }}"
    description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# HostUnusualDiskReadRate: total disk read throughput per host, summed over
# all disks and divided by 1024 twice, has exceeded 200 for 5 minutes.
- alert: HostUnusualDiskReadRate
  # Group by "node" as well as "instance": an aggregation keeps only the
  # labels listed in by(...), and the summary below renders
  # {{ $labels.node }}, which would be empty with by (instance) alone.
  expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Host unusual disk read rate {{ $labels.node }}"
    description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# HostUnusualDiskWriteRate: total disk write throughput per host, summed over
# all disks and divided by 1024 twice, has exceeded 200 for 3 minutes.
- alert: HostUnusualDiskWriteRate
  # Group by "node" as well as "instance": an aggregation keeps only the
  # labels listed in by(...), and the summary below renders
  # {{ $labels.node }}, which would be empty with by (instance) alone.
  expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: "Host unusual disk write rate {{ $labels.node }}"
    description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# HostCpuStealNoisyNeighbor: the average steal-mode CPU time rate across a
# host's CPUs, as a percentage, has exceeded 10 for 1 minute.
- alert: HostCpuStealNoisyNeighbor
  # Group by "node" as well as "instance": an aggregation keeps only the
  # labels listed in by(...), and the summary below renders
  # {{ $labels.node }}, which would be empty with by (instance) alone.
  expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: "Host CPU steal noisy neighbor {{ $labels.node }}"
    description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# HostPhysicalComponentTooHot: a hwmon temperature reading has stayed above
# 90 degrees Celsius for 5 minutes; the sensor label is shown in the text.
- alert: HostPhysicalComponentTooHot
  expr: node_hwmon_temp_celsius > 90
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Host physical component too hot {{ $labels.node }}"
    description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
# SMARTbad: fires with no hold time ("for: 0m") when
# smartmon_device_smart_healthy is below 1. Critical severity.
- alert: SMARTbad
  expr: smartmon_device_smart_healthy < 1
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}"
    description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# "SMARTcheck too old": smartmon_smartctl_run lies more than 10800 seconds
# (3 hours) before the current evaluation time. No "for" clause is set.
- alert: "SMARTcheck too old"
  expr: (time() - smartmon_smartctl_run) > 10800
  labels:
    severity: warning
  annotations:
    summary: "SMARTcheck not running"
    description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
# "ECC Memory errors": the correctable-error total reported by EDAC is
# above 100. The comparison is on the raw total, not on a rate or increase.
- alert: "ECC Memory errors"
  expr: (node_edac_correctable_errors_total) > 100
  labels:
    severity: warning
  annotations:
    summary: "ECC errors on {{ $labels.node }}"
    description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
# "ECC Memory uncorrectable errors": the uncorrectable-error total reported
# by EDAC is above zero. Critical severity.
- alert: "ECC Memory uncorrectable errors"
  expr: (node_edac_uncorrectable_errors_total) > 0
  labels:
    severity: critical
  annotations:
    summary: "ECC errors on {{ $labels.node }}"
    description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
2023-06-22 19:59:06 +02:00
# Rule group "etcdbackup": freshness and result of the etcd backup job.
- name: etcdbackup
  rules:
2024-10-07 09:19:39 +02:00
# "etcdbackup too old": etcdbackup_time lies more than 10800 seconds
# (3 hours) before the current evaluation time. No "for" clause is set.
- alert: "etcdbackup too old"
  expr: (time() - etcdbackup_time) > 10800
  labels:
    severity: warning
  annotations:
    summary: "etcd backup not running"
    description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
# "etcdbackup failed": etcdbackup_result is greater than zero.
# NOTE(review): presumably the backup script's exit status — confirm.
- alert: "etcdbackup failed"
  expr: etcdbackup_result > 0
  labels:
    severity: warning
  annotations:
    summary: "etcdbackup failed"
    description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
2023-06-20 13:15:28 +02:00
# Rule group "kubernetes": workload-level alerts.
- name: kubernetes
  rules:
2024-10-07 09:19:39 +02:00
# KubernetesUnhealthyPod: a container has been reported in a waiting state
# (any reason whose series equals 1) for 5 minutes.
# NOTE(review): the description renders {{ $labels.node }}; verify that this
# series carries a "node" label identifying the pod's node (and not the
# scrape target's node) — TODO confirm against the scrape/relabel config.
- alert: KubernetesUnhealthyPod
  expr: kube_pod_container_status_waiting_reason == 1
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
    description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."