core-deployments/prometheus/alerts.yaml

---
# PrometheusRule custom resource (monitoring.coreos.com/v1) named
# "prometheus-core-deployment-rules" in the "prometheus" namespace.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: prometheus-core-deployment-rules
  namespace: prometheus
spec:
  # List of alert rule groups; every entry is a named group with its own `rules`.
  groups:
    # Used memory (total minus available) as a percentage of total, rounded
    # to 0.1; warns once it has stayed above 80 for 5 minutes.
    - name: memory_high
      rules:
      - alert: MemoryHigh
        expr: >-
          round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
          / node_memory_MemTotal_bytes) * 100), 0.1) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Memory over 80%'
          description: >-
            Memory on node {{ $labels.node }} is over 80% for more than
            5 minutes. Plox fix. Memory usage: {{ $value }}%
    # Warns when the root filesystem ("/") of a node whose `node` label matches
    # "worker.*" has had less than 50 GB available for 1 minute.
    - name: diskspace_low_worker
      rules:
      - alert: DiskspaceLow
        # The divisor 1073742000 approximates 1024^3 (1073741824), converting
        # bytes to GiB-sized units before rounding to 0.1.
        expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
        for: 1m
        labels:
          severity: warning
        annotations:
          # Keep the figure in these texts in sync with the threshold in `expr`.
          summary: "Free disk space below 50 GB"
          description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
    # Warns when the root filesystem ("/") of a node whose `node` label matches
    # "master.*" has had less than 10 GB available for 1 minute.
    - name: diskspace_low_master
      rules:
      - alert: DiskspaceLow
        # The divisor 1073742000 approximates 1024^3 (1073741824), converting
        # bytes to GiB-sized units before rounding to 0.1.
        expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
        for: 1m
        labels:
          severity: warning
        annotations:
          # Keep the figure in these texts in sync with the threshold in `expr`.
          summary: "Free disk space below 10 GB"
          description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
    # Warns when a kube_pod_container_status_waiting_reason series has been
    # equal to 1 for 5 minutes; the annotations report its `reason` label.
    - name: KubernetesUnhealthyPod
      rules:
      - alert: KubernetesUnhealthyPod
        expr: 'kube_pod_container_status_waiting_reason == 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'The Pod {{ $labels.pod }} is {{ $labels.reason }}'
          description: >-
            The Pod {{ $labels.pod }} is in the state {{ $labels.reason }}
            for more than 5m. The Pod is in namespace {{ $labels.namespace }}
            and on node {{ $labels.node }}.
    # Critical when a target's `up` series has been 0 for 3 minutes.
    - name: PrometheusTargetMissing
      rules:
      - alert: PrometheusTargetMissing
        expr: 'up == 0'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: 'Prometheus target missing (instance {{ $labels.instance }})'
          # Each optional line (Job, App, Pod, Node, Namespace) is emitted only
          # when the corresponding label is non-empty.
          description: |-
            A Prometheus target has disappeared. {{if ne $labels.job ""}}
              Job: {{ $labels.job }}{{end}}{{if ne $labels.app ""}}
              App: {{ $labels.app }}{{end}}{{if ne $labels.pod ""}}
              Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node ""}}
              Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace ""}}
              Namespace: {{ $labels.namespace }}{{end}}
    # Critical when prometheus_config_last_reload_successful has been
    # anything other than 1 for 3 minutes.
    - name: PrometheusConfigurationReloadFailure
      rules:
      - alert: PrometheusConfigurationReloadFailure
        expr: 'prometheus_config_last_reload_successful != 1'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: 'Prometheus configuration reload failure (instance {{ $labels.instance }})'
          description: |-
            Prometheus configuration reload error
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Critical when alertmanager_config_last_reload_successful has been
    # anything other than 1 for 3 minutes.
    - name: PrometheusAlertmanagerConfigurationReloadFailure
      rules:
      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: 'alertmanager_config_last_reload_successful != 1'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: 'Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})'
          description: |-
            AlertManager configuration reload error
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Warns when more than one distinct alertmanager_config_hash value has been
    # observed for 3 minutes, i.e. the instances do not share one configuration.
    - name: PrometheusAlertmanagerConfigNotSynced
      rules:
      - alert: PrometheusAlertmanagerConfigNotSynced
        # count_values() groups series by hash value; the outer count() has no
        # `by` clause, so the result is a single series without any labels.
        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
        for: 3m
        labels:
          severity: warning
        annotations:
          # No per-instance label survives the aggregation above, so the
          # summary does not reference {{ $labels.instance }}.
          summary: Prometheus AlertManager config not synced
          description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Critical when prometheus_sd_discovered_targets has been 0 for 3 minutes.
    - name: PrometheusTargetEmpty
      rules:
      - alert: PrometheusTargetEmpty
        expr: 'prometheus_sd_discovered_targets == 0'
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: 'Prometheus target empty (instance {{ $labels.instance }})'
          description: |-
            Prometheus has no target in service discovery
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Warns when the 0.9 quantile of prometheus_target_interval_length_seconds
    # has stayed above 120 for 5 minutes.
    - name: PrometheusTargetScrapingSlow
      rules:
      - alert: PrometheusTargetScrapingSlow
        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Warns when prometheus_target_scrapes_exceeded_sample_limit_total has
    # increased by more than 10 within 10 minutes, sustained for 5 minutes.
    - name: PrometheusLargeScrape
      rules:
      - alert: PrometheusLargeScrape
        expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Prometheus large scrape (instance {{ $labels.instance }})'
          description: |-
            Prometheus has many scrapes that exceed the sample limit
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Warns when the per-second rate of node_vmstat_pgmajfault (1m window)
    # has stayed above 1000 for 2 minutes.
    - name: HostMemoryUnderMemoryPressure
      rules:
      - alert: HostMemoryUnderMemoryPressure
        expr: 'rate(node_vmstat_pgmajfault[1m]) > 1000'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Host memory under memory pressure {{ $labels.node }}'
          description: |-
            The node is under heavy memory pressure. High rate of major page faults
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Warns when the summed disk read rate of a host (2m window), divided by
    # 1024 twice to express it in MB/s, has stayed above 200 for 5 minutes.
    - name: HostUnusualDiskReadRate
      rules:
      - alert: HostUnusualDiskReadRate
        # `node` is kept in the grouping so that {{ $labels.node }} in the
        # summary resolves; grouping by `instance` alone discards it.
        # NOTE(review): assumes `node` maps 1:1 to `instance` - confirm.
        expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk read rate {{ $labels.node }}
          description: "Disk is probably reading too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Warns when the summed disk write rate of a host (2m window), divided by
    # 1024 twice to express it in MB/s, has stayed above 200 for 3 minutes.
    - name: HostUnusualDiskWriteRate
      rules:
      - alert: HostUnusualDiskWriteRate
        # `node` is kept in the grouping so that {{ $labels.node }} in the
        # summary resolves; grouping by `instance` alone discards it.
        # NOTE(review): assumes `node` maps 1:1 to `instance` - confirm.
        expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
        for: 3m
        labels:
          severity: warning
        annotations:
          summary: Host unusual disk write rate {{ $labels.node }}
          description: "Disk is probably writing too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Warns when the average steal-mode CPU rate of a host (5m window),
    # expressed as a percentage, has stayed above 10 for 1 minute.
    - name: HostCpuStealNoisyNeighbor
      rules:
      - alert: HostCpuStealNoisyNeighbor
        # `node` is kept in the grouping so that {{ $labels.node }} in the
        # summary resolves; grouping by `instance` alone discards it.
        # NOTE(review): assumes `node` maps 1:1 to `instance` - confirm.
        expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Host CPU steal noisy neighbor {{ $labels.node }}
          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
    # Warns when a node_hwmon_temp_celsius series has stayed above 85 for
    # 5 minutes; the description reports the `sensor` label and the reading.
    - name: HostPhysicalComponentTooHot
      rules:
      - alert: HostPhysicalComponentTooHot
        expr: 'node_hwmon_temp_celsius > 85'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Host physical component too hot {{ $labels.node }}'
          # Left as an escaped string: the embedded line indents differ (2 vs 1 space).
          description: "Physical hardware component too hot\n  Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
    # Critical as soon as smartmon_device_smart_healthy drops below 1
    # (`for: 0m`, so there is no hold-off period).
    - name: SMARTbad
      rules:
      - alert: SMARTbad
        expr: 'smartmon_device_smart_healthy < 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}'
          description: |-
            SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}
              LABELS = {{ $labels }}
    # Warns when the value of smartmon_smartctl_run is more than 10800 seconds
    # (3 hours) behind the current evaluation time.
    - name: SMARTcheck_old
      rules:
      - alert: 'SMARTcheck too old'
        expr: '(time() - smartmon_smartctl_run) > 10800'
        labels:
          severity: warning
        annotations:
          summary: 'SMARTcheck not running'
          description: "The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix."
change prometheus to prometheus-operator with kube-prometheus, this includes grafana 2023-06-20 08:43:48 +02:00			`---`
			`apiVersion: monitoring.coreos.com/v1`
			`kind: PrometheusRule`
			`metadata:`
			`name: prometheus-core-deployment-rules`
			`namespace: prometheus`
			`spec:`
			`groups:`
			`- name: memory_high`
			`rules:`
			`- alert: MemoryHigh`
			`expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Memory over 80%"`
			`description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"`
			`- name: diskspace_low_worker`
			`rules:`
			`- alert: DiskspaceLow`
			`expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Free disk space below 10 GB"`
			`description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"`
			`- name: diskspace_low_master`
			`rules:`
			`- alert: DiskspaceLow`
			`expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Free disk space below 2 GB"`
			`description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"`
			`- name: KubernetesUnhealthyPod`
			`rules:`
			`- alert: KubernetesUnhealthyPod`
			`expr: kube_pod_container_status_waiting_reason == 1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"`
			`description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."`
			`- name: PrometheusTargetMissing`
			`rules:`
			`- alert: PrometheusTargetMissing`
			`expr: up == 0`
			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: Prometheus target missing (instance {{ $labels.instance }})`
			`description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"`
			`- name: PrometheusConfigurationReloadFailure`
			`rules:`
			`- alert: PrometheusConfigurationReloadFailure`
			`expr: prometheus_config_last_reload_successful != 1`
			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: Prometheus configuration reload failure (instance {{ $labels.instance }})`
			`description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: PrometheusAlertmanagerConfigurationReloadFailure`
			`rules:`
			`- alert: PrometheusAlertmanagerConfigurationReloadFailure`
			`expr: alertmanager_config_last_reload_successful != 1`
			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})`
			`description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: PrometheusAlertmanagerConfigNotSynced`
			`rules:`
			`- alert: PrometheusAlertmanagerConfigNotSynced`
			`expr: count(count_values("config_hash", alertmanager_config_hash)) > 1`
			`for: 3m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})`
			`description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: PrometheusTargetEmpty`
			`rules:`
			`- alert: PrometheusTargetEmpty`
			`expr: prometheus_sd_discovered_targets == 0`
			`for: 3m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: Prometheus target empty (instance {{ $labels.instance }})`
			`description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: PrometheusTargetScrapingSlow`
			`rules:`
			`- alert: PrometheusTargetScrapingSlow`
			`expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Prometheus target scraping (instance {{ $labels.instance }})`
			`description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: PrometheusLargeScrape`
			`rules:`
			`- alert: PrometheusLargeScrape`
			`expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Prometheus large scrape (instance {{ $labels.instance }})`
			`description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: HostMemoryUnderMemoryPressure`
			`rules:`
			`- alert: HostMemoryUnderMemoryPressure`
			`expr: rate(node_vmstat_pgmajfault[1m]) > 1000`
			`for: 2m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host memory under memory pressure {{ $labels.node }}`
			`description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: HostUnusualDiskReadRate`
			`rules:`
			`- alert: HostUnusualDiskReadRate`
			`expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host unusual disk read rate {{ $labels.node }}`
			`description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: HostUnusualDiskWriteRate`
			`rules:`
			`- alert: HostUnusualDiskWriteRate`
			`expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200`
			`for: 3m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host unusual disk write rate {{ $labels.node }}`
			`description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: HostCpuStealNoisyNeighbor`
			`rules:`
			`- alert: HostCpuStealNoisyNeighbor`
			`expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10`
			`for: 1m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host CPU steal noisy neighbor {{ $labels.node }}`
			`description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: HostPhysicalComponentTooHot`
			`rules:`
			`- alert: HostPhysicalComponentTooHot`
			`expr: node_hwmon_temp_celsius > 85`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: Host physical component too hot {{ $labels.node }}`
			`description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"`
			`- name: SMARTbad`
			`rules:`
			`- alert: SMARTbad`
			`expr: smartmon_device_smart_healthy < 1`
			`for: 0m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}`
			`description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"`
			`- name: SMARTcheck_old`
			`rules:`
			`- alert: "SMARTcheck too old"`
			`expr: (time() - smartmon_smartctl_run) > 10800`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "SMARTcheck not running"`
			`description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'`