add more alerts

2023-04-20 05:40:46 +02:00 · 2023-04-20 05:40:46 +02:00 · 517f048d2b
commit 517f048d2b
parent 2db14ec4f8
1 changed files with 159 additions and 0 deletions
--- a/prometheus/config-map.yaml
+++ b/prometheus/config-map.yaml
@ -24,6 +24,165 @@ data:
        annotations:
          summary: "Memory over 80%"
          description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
+    - name: diskspace_low_worker  # free space per filesystem on worker.* nodes, in GB
+      rules:
+      - alert: DiskspaceLow
+        expr: round(node_filesystem_avail_bytes{node=~"worker.*", fstype!~"tmpfs|ramfs"} / 1024 / 1024 / 1024, 0.1) < 10
+        for: 1m  # NOTE(review): filesystems smaller than 10 GB (e.g. /boot) always match; add a mountpoint filter if noisy
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free disk space below 10 GB"
+          description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: diskspace_low_master  # free space per filesystem on master.* nodes, in GB
+      rules:
+      - alert: DiskspaceLow
+        expr: round(node_filesystem_avail_bytes{node=~"master.*", fstype!~"tmpfs|ramfs"} / 1024 / 1024 / 1024, 0.1) < 2
+        for: 1m  # NOTE(review): filesystems smaller than 2 GB (e.g. /boot) always match; add a mountpoint filter if noisy
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free disk space below 2 GB"
+          description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: PrometheusTargetMissing  # a scrape target's "up" series has been 0 for 3 minutes
+      rules:
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus target missing (instance {{ $labels.instance }})"
+          description: "A Prometheus target has disappeared. An exporter might be crashed.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusConfigurationReloadFailure  # prometheus_config_last_reload_successful differs from 1 for 3 minutes
+      rules:
+      - alert: PrometheusConfigurationReloadFailure
+        expr: prometheus_config_last_reload_successful != 1
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
+          description: "Prometheus configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigurationReloadFailure  # alertmanager_config_last_reload_successful differs from 1 for 3 minutes
+      rules:
+      - alert: PrometheusAlertmanagerConfigurationReloadFailure
+        expr: alertmanager_config_last_reload_successful != 1
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
+          description: "AlertManager configuration reload error\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigNotSynced  # more than one distinct alertmanager_config_hash value is reported
+      rules:
+      - alert: PrometheusAlertmanagerConfigNotSynced
+        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1  # aggregation drops all labels, so none are available to the templates
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Prometheus AlertManager config not synced"
+          description: "Configurations of AlertManager cluster instances are out of sync\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusTargetEmpty  # prometheus_sd_discovered_targets has been 0 for 3 minutes
+      rules:
+      - alert: PrometheusTargetEmpty
+        expr: prometheus_sd_discovered_targets == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus target empty (instance {{ $labels.instance }})"
+          description: "Prometheus has no target in service discovery\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusTargetScrapingSlow  # 0.9-quantile of prometheus_target_interval_length_seconds above 120 for 5 minutes
+      rules:
+      - alert: PrometheusTargetScrapingSlow
+        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
+          description: "Prometheus is scraping exporters slowly\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: PrometheusLargeScrape  # sample-limit-exceeded scrape counter grew by more than 10 over 10 minutes, held for 5 minutes
+      rules:
+      - alert: PrometheusLargeScrape
+        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Prometheus large scrape (instance {{ $labels.instance }})"
+          description: "Prometheus has many scrapes that exceed the sample limit\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: HostMemoryUnderMemoryPressure  # per-second rate of node_vmstat_pgmajfault (1m window) above 1000 for 2 minutes
+      rules:
+      - alert: HostMemoryUnderMemoryPressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host memory under memory pressure {{ $labels.node }}"
+          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: HostUnusualDiskReadRate  # summed disk read throughput per host above 200 MB/s for 5 minutes
+      rules:
+      - alert: HostUnusualDiskReadRate
+        expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200  # keep "node" so the summary template can print it
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk read rate {{ $labels.node }}"
+          description: "Disk is probably reading too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: HostUnusualDiskWriteRate  # summed disk write throughput per host above 200 MB/s for 3 minutes
+      rules:
+      - alert: HostUnusualDiskWriteRate
+        expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200  # keep "node" so the summary template can print it
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host unusual disk write rate {{ $labels.node }}"
+          description: "Disk is probably writing too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: HostCpuStealNoisyNeighbor  # average steal-mode CPU share per host above 10% for 1 minute
+      rules:
+      - alert: HostCpuStealNoisyNeighbor
+        expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10  # keep "node" so the summary template can print it
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host CPU steal noisy neighbor {{ $labels.node }}"
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: HostPhysicalComponentTooHot  # node_hwmon_temp_celsius above 85 for 5 minutes
+      rules:
+      - alert: HostPhysicalComponentTooHot
+        expr: node_hwmon_temp_celsius > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host physical component too hot {{ $labels.node }}"
+          description: "Physical hardware component too hot\n  Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
+    - name: SMARTbad  # smartmon_device_smart_healthy below 1
+      rules:
+      - alert: SMARTbad
+        expr: smartmon_device_smart_healthy < 1
+        for: 0m  # no hold period: fires on the first evaluation that matches
+        labels:
+          severity: critical
+        annotations:
+          summary: "SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}"
+          description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+    - name: SMARTcheck_old  # smartmon_smartctl_run timestamp is older than 3 hours
+      rules:
+      - alert: "SMARTcheck too old"
+        expr: (time() - smartmon_smartctl_run) > 10800  # 10800 s = 3 h
+        labels:
+          severity: warning
+        annotations:
+          summary: "SMARTcheck not running"
+          description: "The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix."
  allow-snippet-annotations: 'false'
  prometheus.yml: |
    global: