# core-deployments/prometheus/alerts.yaml

---
# PrometheusRule custom resource (API group monitoring.coreos.com/v1) that
# carries the alerting rule groups listed under spec.groups.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-core-deployment-rules
  namespace: prometheus
  labels:
    # NOTE(review): presumably the label the Prometheus resource's ruleSelector
    # matches on to load these rules — confirm against the Prometheus CR.
    monitor: core-deployment
spec:
  groups:
    # Host-level alerts built on node, SMART and EDAC metrics.
    - name: hardware
      rules:
        # Used memory, i.e. (MemTotal - MemAvailable) as a percentage of
        # MemTotal rounded to the nearest 0.1, has been above 80 for 5 minutes.
        - alert: MemoryHigh
          expr: round((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100, 0.1) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Memory over 80%'
            description: >-
              Memory on node {{ $labels.node }} is over 80% for more than 5 minutes.
              Plox fix. Memory usage: {{ $value }}%
        # Available space on the filesystem mounted at "/", as a percentage of
        # its size rounded to a whole number, has been below 5 for 1 minute.
        # NOTE(review): the comparison runs on the rounded value, so anything
        # that rounds to 5 (4.5 and up) does not fire — confirm this is intended.
        - alert: DiskspaceLow
          expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: 'Free disk space at {{ $value }}%'
            description: >-
              Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix.
              Partition: {{ $labels.device }}
        # The per-second rate of major page faults, computed over a 1-minute
        # window, has been above 1000 for 2 minutes.
        - alert: HostMemoryUnderMemoryPressure
          expr: rate(node_vmstat_pgmajfault[1m]) > 1000
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: 'Host memory under memory pressure {{ $labels.node }}'
            # Literal block scalar: the two continuation lines keep their
            # two-space indent in the rendered text.
            description: |-
              The node is under heavy memory pressure. High rate of major page faults
                VALUE = {{ $value }}
                LABELS = {{ $labels }}
        # Summed read throughput of all disks of one host (bytes/s over a
        # 2-minute window, divided by 1024 twice) has been above 200 for 5 minutes.
        - alert: HostUnusualDiskReadRate
          # `node` is part of the grouping so that {{ $labels.node }} in the
          # summary is populated; aggregating by `instance` alone would drop it.
          # If the series carry no `node` label the grouping is unaffected.
          expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Host unusual disk read rate {{ $labels.node }}'
            description: |-
              Disk is probably reading too much data (> 200 MB/s)
                VALUE = {{ $value }}
                LABELS = {{ $labels }}
        # Summed write throughput of all disks of one host (bytes/s over a
        # 2-minute window, divided by 1024 twice) has been above 200 for 3 minutes.
        - alert: HostUnusualDiskWriteRate
          # `node` is part of the grouping so that {{ $labels.node }} in the
          # summary is populated; aggregating by `instance` alone would drop it.
          # If the series carry no `node` label the grouping is unaffected.
          expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
          for: 3m
          labels:
            severity: warning
          annotations:
            summary: 'Host unusual disk write rate {{ $labels.node }}'
            description: |-
              Disk is probably writing too much data (> 200 MB/s)
                VALUE = {{ $value }}
                LABELS = {{ $labels }}
        # Average steal-mode CPU time per host (rate over 5 minutes, times 100)
        # has been above 10 for 1 minute.
        - alert: HostCpuStealNoisyNeighbor
          # `node` is part of the grouping so that {{ $labels.node }} in the
          # summary is populated; aggregating by `instance` alone would drop it.
          # If the series carry no `node` label the grouping is unaffected.
          expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: 'Host CPU steal noisy neighbor {{ $labels.node }}'
            description: |-
              CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
                VALUE = {{ $value }}
                LABELS = {{ $labels }}
        # A hwmon temperature reading has been above 90 (degrees Celsius, per the
        # metric name) for 5 minutes.
        - alert: HostPhysicalComponentTooHot
          expr: node_hwmon_temp_celsius > 90
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Host physical component too hot {{ $labels.node }}'
            # The "Sensor" line is indented by two spaces and the "Temp" line by
            # one; this reproduces the existing rendered text exactly.
            description: |-
              Physical hardware component too hot
                Sensor = {{ $labels.sensor }}
               Temp = {{ $value }}
        # The SMART health metric of a drive is below 1. With `for: 0m` the alert
        # fires on the first evaluation in which the condition holds.
        - alert: SMARTbad
          expr: smartmon_device_smart_healthy < 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: 'SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}'
            description: |-
              SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}
                LABELS = {{ $labels }}
        # The current time is more than 10800 seconds (3 hours) past the value of
        # smartmon_smartctl_run. No `for` is set, so there is no pending period.
        # NOTE(review): assumes smartmon_smartctl_run is the Unix timestamp of
        # the last smartctl run — confirm against the exporter.
        - alert: 'SMARTcheck too old'
          expr: time() - smartmon_smartctl_run > 10800
          labels:
            severity: warning
          annotations:
            summary: 'SMARTcheck not running'
            description: "The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix."
        # The counter of correctable EDAC errors is above 100. The raw counter
        # value is compared, not a rate, and no `for` is set.
        - alert: 'ECC Memory errors'
          expr: node_edac_correctable_errors_total > 100
          labels:
            severity: warning
          annotations:
            summary: 'ECC errors on {{ $labels.node }}'
            description: "The node {{ $labels.node }} accumulated {{ $value }} correctable errors."
        # The counter of uncorrectable EDAC errors is above 0, i.e. at least one
        # such error has been counted. No `for` is set.
        - alert: 'ECC Memory uncorrectable errors'
          expr: node_edac_uncorrectable_errors_total > 0
          labels:
            severity: critical
          annotations:
            summary: 'ECC errors on {{ $labels.node }}'
            description: "The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors."
    # Alerts on the metrics published for the etcd backup job.
    - name: etcdbackup
      rules:
        # The current time is more than 10800 seconds (3 hours) past the value
        # of etcdbackup_time. No `for` is set.
        # NOTE(review): assumes etcdbackup_time is the Unix timestamp of the
        # last backup — confirm against the backup script.
        - alert: 'etcdbackup too old'
          expr: time() - etcdbackup_time > 10800
          labels:
            severity: warning
          annotations:
            summary: 'etcd backup not running'
            description: "The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix."
        # etcdbackup_result is above 0. No `for` is set.
        # NOTE(review): presumably the exit status of the backup script, with
        # 0 meaning success — confirm.
        - alert: 'etcdbackup failed'
          expr: etcdbackup_result > 0
          labels:
            severity: warning
          annotations:
            summary: 'etcdbackup failed'
            description: 'The backup script for etcd failed on node {{ $labels.node }}. Plox fix.'
    # Alerts on Kubernetes workload state.
    - name: kubernetes
      rules:
        # A kube_pod_container_status_waiting_reason series has held the value 1
        # for 5 minutes; the annotations report its `reason` label.
        # NOTE(review): the description prints {{ $labels.node }} — confirm the
        # metric carries a `node` label here, otherwise that part renders empty.
        - alert: KubernetesUnhealthyPod
          expr: kube_pod_container_status_waiting_reason == 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'The Pod {{ $labels.pod }} is {{ $labels.reason }}'
            description: >-
              The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m.
              The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}.
change prometheus to prometheus-operator with kube-prometheus, this includes grafana 2023-06-20 08:43:48 +02:00			`---`
			`apiVersion: monitoring.coreos.com/v1`
			`kind: PrometheusRule`
			`metadata:`
			`name: prometheus-core-deployment-rules`
			`namespace: prometheus`
try to fix prometheus deployment 6 (final) (for now) 2023-06-20 13:15:28 +02:00			`labels:`
			`monitor: core-deployment`
change prometheus to prometheus-operator with kube-prometheus, this includes grafana 2023-06-20 08:43:48 +02:00			`spec:`
			`groups:`
try to fix prometheus deployment 6 (final) (for now) 2023-06-20 13:15:28 +02:00			`- name: hardware`
change prometheus to prometheus-operator with kube-prometheus, this includes grafana 2023-06-20 08:43:48 +02:00			`rules:`
fix typos and file layout for yamllint 2024-10-07 09:19:39 +02:00			`- alert: MemoryHigh`
			`expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Memory over 80%"`
			`description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"`
			`- alert: DiskspaceLow`
			`expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Free disk space at {{ $value }}%"`
			`description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"`
			`- alert: HostMemoryUnderMemoryPressure`
			`expr: rate(node_vmstat_pgmajfault[1m]) > 1000`
			`for: 2m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host memory under memory pressure {{ $labels.node }}`
			`description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- alert: HostUnusualDiskReadRate`
			`expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host unusual disk read rate {{ $labels.node }}`
			`description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- alert: HostUnusualDiskWriteRate`
			`expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200`
			`for: 3m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host unusual disk write rate {{ $labels.node }}`
			`description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- alert: HostCpuStealNoisyNeighbor`
			`expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host CPU steal noisy neighbor {{ $labels.node }}`
			`description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- alert: HostPhysicalComponentTooHot`
			`expr: node_hwmon_temp_celsius > 90`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host physical component too hot {{ $labels.node }}`
			`description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"`
			`- alert: SMARTbad`
			`expr: smartmon_device_smart_healthy < 1`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}`
			`description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- alert: "SMARTcheck too old"`
			`expr: (time() - smartmon_smartctl_run) > 10800`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "SMARTcheck not running"`
			`description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'`
			`- alert: "ECC Memory errors"`
			`expr: (node_edac_correctable_errors_total) > 100`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "ECC errors on {{ $labels.node }}"`
			`description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'`
			`- alert: "ECC Memory uncorrectable errors"`
			`expr: (node_edac_uncorrectable_errors_total) > 0`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "ECC errors on {{ $labels.node }}"`
			`description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'`
add etcdbackup alerts 2023-06-22 19:59:06 +02:00			`- name: etcdbackup`
			`rules:`
fix typos and file layout for yamllint 2024-10-07 09:19:39 +02:00			`- alert: "etcdbackup too old"`
			`expr: (time() - etcdbackup_time) > 10800`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "etcd backup not running"`
			`description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'`
			`- alert: "etcdbackup failed"`
			`expr: etcdbackup_result > 0`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "etcdbackup failed"`
			`description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."`
try to fix prometheus deployment 6 (final) (for now) 2023-06-20 13:15:28 +02:00			`- name: kubernetes`
			`rules:`
fix typos and file layout for yamllint 2024-10-07 09:19:39 +02:00			`- alert: KubernetesUnhealthyPod`
			`expr: kube_pod_container_status_waiting_reason == 1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"`
			`description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."`