--- kind: ConfigMap metadata: labels: app: prometheus app.kubernetes.io/instance: prometheus component: server release: prometheus name: prometheus-server namespace: prometheus apiVersion: v1 data: alerting_rules.yml: | {} alerts: | groups: - name: memory_high rules: - alert: MemoryHigh expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80 for: 5m labels: severity: warning annotations: summary: "Memory over 80%" description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%" - name: diskspace_low_worker rules: - alert: DiskspaceLow expr: round((((node_memory_MemTotal_bytes{node=~"worker.*"} - node_memory_MemAvailable_bytes{node=~"worker.*"}) / node_memory_MemTotal_bytes{node=~"worker.*"}) * 100), 0.1) < 10 for: 1m labels: severity: warning annotations: summary: "Free disk space below 10 GB" description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - name: diskspace_low_master rules: - alert: DiskspaceLow expr: round((((node_memory_MemTotal_bytes{node=~"master.*"} - node_memory_MemAvailable_bytes{node=~"master.*"}) / node_memory_MemTotal_bytes{node=~"master.*"}) * 100), 0.1) < 2 for: 1m labels: severity: warning annotations: summary: "Free disk space below 2 GB" description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - name: PrometheusTargetMissing rules: - alert: PrometheusTargetMissing expr: up == 0 for: 3m labels: severity: critical annotations: summary: Prometheus target missing (instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusConfigurationReloadFailure rules: - alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 for: 3m labels: severity: critical annotations: summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusAlertmanagerConfigurationReloadFailure rules: - alert: PrometheusAlertmanagerConfigurationReloadFailure expr: alertmanager_config_last_reload_successful != 1 for: 3m labels: severity: critical annotations: summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusAlertmanagerConfigNotSynced rules: - alert: PrometheusAlertmanagerConfigNotSynced expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 for: 3m labels: severity: warning annotations: summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusTargetEmpty rules: - alert: PrometheusTargetEmpty expr: prometheus_sd_discovered_targets == 0 for: 3m labels: severity: critical annotations: summary: Prometheus target empty (instance {{ $labels.instance }}) description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusTargetScrapingSlow rules: - alert: PrometheusTargetScrapingSlow expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120 for: 5m labels: severity: warning annotations: summary: Prometheus target scraping (instance {{ $labels.instance }}) description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: PrometheusLargeScrape rules: - alert: PrometheusLargeScrape expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 for: 5m labels: severity: warning annotations: summary: Prometheus large scrape (instance {{ $labels.instance }}) description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: HostMemoryUnderMemoryPressure rules: - alert: HostMemoryUnderMemoryPressure expr: rate(node_vmstat_pgmajfault[1m]) > 1000 for: 2m labels: severity: warning annotations: summary: Host memory under memory pressure {{ $labels.node }} description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: HostUnusualDiskReadRate rules: - alert: HostUnusualDiskReadRate expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200 for: 5m labels: severity: warning annotations: summary: Host unusual disk read rate {{ $labels.node }} description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: HostUnusualDiskWriteRate rules: - alert: HostUnusualDiskWriteRate expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200 for: 3m labels: severity: warning annotations: summary: Host unusual disk write rate {{ $labels.node }} description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: HostCpuStealNoisyNeighbor rules: - alert: HostCpuStealNoisyNeighbor expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 for: 1m labels: severity: warning annotations: summary: Host CPU steal noisy neighbor {{ $labels.node }} description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: HostPhysicalComponentTooHot rules: - alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > 85 for: 5m labels: severity: warning annotations: summary: Host physical component too hot {{ $labels.node }} description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}" - name: SMARTbad rules: - alert: SMARTbad expr: smartmon_device_smart_healthy < 1 for: 0m labels: severity: critical annotations: summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }} description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" - name: SMARTcheck_old rules: - alert: "SMARTcheck too old" expr: (time() - smartmon_smartctl_run) > 10800 labels: severity: warning annotations: summary: "SMARTcheck not running" description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.' allow-snippet-annotations: 'false' prometheus.yml: | global: evaluation_interval: 1m scrape_interval: 1m scrape_timeout: 10s rule_files: - /etc/config/recording_rules.yml - /etc/config/alerting_rules.yml - /etc/config/rules - /etc/config/alerts scrape_configs: - job_name: prometheus static_configs: - targets: - localhost:9090 - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token job_name: kubernetes-apiservers kubernetes_sd_configs: - role: endpoints relabel_configs: - action: keep regex: default;kubernetes;https source_labels: - __meta_kubernetes_namespace - __meta_kubernetes_service_name - __meta_kubernetes_endpoint_port_name scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token job_name: kubernetes-nodes kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - replacement: kubernetes.default.svc:443 target_label: __address__ - regex: (.+) replacement: /api/v1/nodes/$1/proxy/metrics source_labels: - __meta_kubernetes_node_name target_label: __metrics_path__ scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token job_name: kubernetes-nodes-cadvisor kubernetes_sd_configs: - role: node relabel_configs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) - replacement: kubernetes.default.svc:443 target_label: __address__ - regex: (.+) replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor source_labels: - __meta_kubernetes_node_name target_label: __metrics_path__ scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true - honor_labels: true job_name: kubernetes-service-endpoints kubernetes_sd_configs: - role: endpoints relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scrape - action: drop regex: true source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - action: replace regex: (https?) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scheme target_label: __scheme__ - action: replace regex: (.+) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: (.+?)(?::\d+)?;(\d+) replacement: $1:$2 source_labels: - __address__ - __meta_kubernetes_service_annotation_prometheus_io_port target_label: __address__ - action: labelmap regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: namespace - action: replace source_labels: - __meta_kubernetes_service_name target_label: service - action: replace source_labels: - __meta_kubernetes_pod_node_name target_label: node - honor_labels: true job_name: kubernetes-service-endpoints-slow kubernetes_sd_configs: - role: endpoints relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - action: replace regex: (https?) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_scheme target_label: __scheme__ - action: replace regex: (.+) source_labels: - __meta_kubernetes_service_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: (.+?)(?::\d+)?;(\d+) replacement: $1:$2 source_labels: - __address__ - __meta_kubernetes_service_annotation_prometheus_io_port target_label: __address__ - action: labelmap regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_service_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: namespace - action: replace source_labels: - __meta_kubernetes_service_name target_label: service - action: replace source_labels: - __meta_kubernetes_pod_node_name target_label: node scrape_interval: 5m scrape_timeout: 30s - honor_labels: true job_name: prometheus-pushgateway kubernetes_sd_configs: - role: service relabel_configs: - action: keep regex: pushgateway source_labels: - __meta_kubernetes_service_annotation_prometheus_io_probe - honor_labels: true job_name: kubernetes-services kubernetes_sd_configs: - role: service metrics_path: /probe params: module: - http_2xx relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_service_annotation_prometheus_io_probe - source_labels: - __address__ target_label: __param_target - replacement: blackbox target_label: __address__ - source_labels: - __param_target target_label: instance - action: labelmap regex: __meta_kubernetes_service_label_(.+) - source_labels: - __meta_kubernetes_namespace target_label: namespace - source_labels: - __meta_kubernetes_service_name target_label: service - honor_labels: true job_name: kubernetes-pods kubernetes_sd_configs: - role: pod relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_scrape - action: drop regex: true source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - action: replace regex: (https?) source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_scheme target_label: __scheme__ - action: replace regex: (.+) source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) replacement: '[$2]:$1' source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_port - __meta_kubernetes_pod_ip target_label: __address__ - action: replace regex: (\d+);((([0-9]+?)(\.|$)){4}) replacement: $2:$1 source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_port - __meta_kubernetes_pod_ip target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: namespace - action: replace source_labels: - __meta_kubernetes_pod_name target_label: pod - action: drop regex: Pending|Succeeded|Failed|Completed source_labels: - __meta_kubernetes_pod_phase - honor_labels: true job_name: kubernetes-pods-slow kubernetes_sd_configs: - role: pod relabel_configs: - action: keep regex: true source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - action: replace regex: (https?) source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_scheme target_label: __scheme__ - action: replace regex: (.+) source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_path target_label: __metrics_path__ - action: replace regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) replacement: '[$2]:$1' source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_port - __meta_kubernetes_pod_ip target_label: __address__ - action: replace regex: (\d+);((([0-9]+?)(\.|$)){4}) replacement: $2:$1 source_labels: - __meta_kubernetes_pod_annotation_prometheus_io_port - __meta_kubernetes_pod_ip target_label: __address__ - action: labelmap regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) replacement: __param_$1 - action: labelmap regex: __meta_kubernetes_pod_label_(.+) - action: replace source_labels: - __meta_kubernetes_namespace target_label: namespace - action: replace source_labels: - __meta_kubernetes_pod_name target_label: pod - action: drop regex: Pending|Succeeded|Failed|Completed source_labels: - __meta_kubernetes_pod_phase scrape_interval: 5m scrape_timeout: 30s alerting: alertmanagers: - kubernetes_sd_configs: - role: pod tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token relabel_configs: - source_labels: [__meta_kubernetes_namespace] regex: prometheus action: keep - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] regex: prometheus action: keep - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] regex: alertmanager action: keep - source_labels: [__meta_kubernetes_pod_container_port_number] regex: "9093" action: keep recording_rules.yml: | {} rules: | {}