diff --git a/prometheus/config-map.yaml b/prometheus/config-map.yaml
index f84e776..34664d0 100644
--- a/prometheus/config-map.yaml
+++ b/prometheus/config-map.yaml
@@ -24,6 +24,165 @@ data:
         annotations:
           summary: "Memory over 80%"
           description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
+    - name: diskspace_low_worker
+      rules:
+      - alert: DiskspaceLow
+        expr: round(node_filesystem_avail_bytes{node=~"worker.*"} / 1024 / 1024 / 1024, 0.1) < 10
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free disk space below 10 GB"
+          description: "Disk space on server {{ $labels.node }} is under 10 GB. Please fix. Free space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: diskspace_low_master
+      rules:
+      - alert: DiskspaceLow
+        expr: round(node_filesystem_avail_bytes{node=~"master.*"} / 1024 / 1024 / 1024, 0.1) < 2
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Free disk space below 2 GB"
+          description: "Disk space on server {{ $labels.node }} is under 2 GB. Please fix. Free space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: PrometheusTargetMissing
+      rules:
+      - alert: PrometheusTargetMissing
+        expr: up == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target missing (instance {{ $labels.instance }})
+          description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusConfigurationReloadFailure
+      rules:
+      - alert: PrometheusConfigurationReloadFailure
+        expr: prometheus_config_last_reload_successful != 1
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigurationReloadFailure
+      rules:
+      - alert: PrometheusAlertmanagerConfigurationReloadFailure
+        expr: alertmanager_config_last_reload_successful != 1
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigNotSynced
+      rules:
+      - alert: PrometheusAlertmanagerConfigNotSynced
+        expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusTargetEmpty
+      rules:
+      - alert: PrometheusTargetEmpty
+        expr: prometheus_sd_discovered_targets == 0
+        for: 3m
+        labels:
+          severity: critical
+        annotations:
+          summary: Prometheus target empty (instance {{ $labels.instance }})
+          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusTargetScrapingSlow
+      rules:
+      - alert: PrometheusTargetScrapingSlow
+        expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+          description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusLargeScrape
+      rules:
+      - alert: PrometheusLargeScrape
+        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Prometheus large scrape (instance {{ $labels.instance }})
+          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostMemoryUnderMemoryPressure
+      rules:
+      - alert: HostMemoryUnderMemoryPressure
+        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host memory under memory pressure {{ $labels.node }}
+          description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostUnusualDiskReadRate
+      rules:
+      - alert: HostUnusualDiskReadRate
+        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk read rate {{ $labels.instance }}
+          description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostUnusualDiskWriteRate
+      rules:
+      - alert: HostUnusualDiskWriteRate
+        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
+        for: 3m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host unusual disk write rate {{ $labels.instance }}
+          description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostCpuStealNoisyNeighbor
+      rules:
+      - alert: HostCpuStealNoisyNeighbor
+        expr: avg by (instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+        for: 1m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host CPU steal noisy neighbor {{ $labels.instance }}
+          description: "CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostPhysicalComponentTooHot
+      rules:
+      - alert: HostPhysicalComponentTooHot
+        expr: node_hwmon_temp_celsius > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: Host physical component too hot {{ $labels.node }}
+          description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
+    - name: SMARTbad
+      rules:
+      - alert: SMARTbad
+        expr: smartmon_device_smart_healthy < 1
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: SMART health check failed for drive {{ $labels.exported_disk }} in server {{ $labels.node }}
+          description: "SMART check returned bad health for {{ $labels.exported_disk }} in server {{ $labels.node }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: SMARTcheck_old
+      rules:
+      - alert: SMARTcheckTooOld
+        expr: (time() - smartmon_smartctl_run) > 10800
+        labels:
+          severity: warning
+        annotations:
+          summary: "SMARTcheck not running"
+          description: 'The last SMARTcheck on server {{ $labels.node }} ran more than 3h ago. Please fix.'
   allow-snippet-annotations: 'false'
   prometheus.yml: |
     global:
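
Sanity note on the DiskspaceLow expressions above: they reduce to "free bytes per filesystem, converted to GB and rounded to 0.1". A minimal sketch for previewing which series would fire, assuming node_exporter's filesystem collector is enabled and a node label is attached at scrape time (as the rules above assume); paste the bare expression into the Prometheus expression browser:

    # Free space in GB per mounted filesystem on worker nodes, rounded
    # to the nearest 0.1 GB -- the same shape the DiskspaceLow alert evaluates.
    round(node_filesystem_avail_bytes{node=~"worker.*"} / 1024 / 1024 / 1024, 0.1) < 10

Dropping the trailing "< 10" returns current free space for every partition rather than only those under the threshold, which makes tuning the limit easier.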