---
# PrometheusRule (prometheus-operator CRD) holding the alerting rules for the
# core deployment: a "hardware" group with node-level alerts and a
# "kubernetes" group with a pod health alert.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-core-deployment-rules
  namespace: prometheus
  labels:
    # NOTE(review): presumably matched by the Prometheus ruleSelector - verify.
    monitor: core-deployment
spec:
  groups:
    - name: hardware
      rules:
        # Used memory (total - available) as a percentage of total,
        # rounded to the nearest 0.1.
        - alert: MemoryHigh
          expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Memory over 90%"
            description: "Memory on node {{ $labels.node }} is over 90% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"

        # Free space on "/" of worker nodes. 1073741824 = 1024^3 converts
        # bytes to GiB. The annotation text states the same 50 GB limit the
        # expression enforces.
        - alert: DiskspaceLowWorker
          expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073741824, 0.1) < 50
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Free disk space below 50 GB"
            description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"

        # Same check for master nodes with a lower limit (10 GB).
        - alert: DiskspaceLowMaster
          expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073741824, 0.1) < 10
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Free disk space below 10 GB"
            description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"

        # Per-second rate of major page faults over the last minute.
        - alert: HostMemoryUnderMemoryPressure
          expr: rate(node_vmstat_pgmajfault[1m]) > 1000
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "Host memory under memory pressure {{ $labels.node }}"
            description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Read throughput summed over all disks, converted to MiB/s.
        # `node` is kept in the grouping so that {{ $labels.node }} in the
        # summary resolves; aggregating by `instance` alone drops it.
        - alert: HostUnusualDiskReadRate
          expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Host unusual disk read rate {{ $labels.node }}"
            description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Write throughput summed over all disks, converted to MiB/s.
        # `node` is kept in the grouping for the summary template.
        - alert: HostUnusualDiskWriteRate
          expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: "Host unusual disk write rate {{ $labels.node }}"
            description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Average share of CPU time spent in "steal" mode, in percent.
        # `node` is kept in the grouping for the summary template.
        - alert: HostCpuStealNoisyNeighbor
          expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: "Host CPU steal noisy neighbor {{ $labels.node }}"
            description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # Any hwmon temperature sensor above 85 degrees Celsius.
        - alert: HostPhysicalComponentTooHot
          expr: node_hwmon_temp_celsius > 85
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Host physical component too hot {{ $labels.node }}"
            description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"

        # Fires immediately (for: 0m) when the SMART health metric is below 1.
        - alert: SMARTbad
          expr: smartmon_device_smart_healthy < 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}"
            description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"

        # 10800 s = 3 h since the last recorded smartctl run.
        # The alert name contains spaces and is kept as-is because routing
        # or silences may match on it.
        - alert: "SMARTcheck too old"
          expr: (time() - smartmon_smartctl_run) > 10800
          labels:
            severity: warning
          annotations:
            summary: "SMARTcheck not running"
            description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'

    - name: kubernetes
      rules:
        # A container has been in a waiting state (reason exposed via the
        # `reason` label) for at least 5 minutes.
        - alert: KubernetesUnhealthyPod
          expr: kube_pod_container_status_waiting_reason == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."