2023-04-06 18:51:47 +02:00
---
# ConfigMap "prometheus-server" in the "prometheus" namespace.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-server
  namespace: prometheus
  labels:
    app: prometheus
    app.kubernetes.io/instance: prometheus
    component: server
    release: prometheus
data:
  # Holds an empty mapping; no rules are defined in this entry.
  alerting_rules.yml: |
    {}
alerts: |
  groups:
    # Warns when used memory (total minus available, as a percentage of
    # total, rounded to 0.1) stays above 80 for 5 minutes.
    - name: memory_high
      rules:
        - alert: MemoryHigh
          expr: >-
            round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
            / node_memory_MemTotal_bytes) * 100), 0.1) > 80
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: "Memory over 80%"
            description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
2023-04-20 05:40:46 +02:00
# Warns when the "/" filesystem on a node matching "worker.*" has had less
# than 50 GiB available for 1 minute.
- name: diskspace_low_worker
  rules:
    - alert: DiskspaceLow
      # Available bytes divided by 1073741824 (2^30, exactly 1 GiB) and
      # rounded to the nearest 0.1.
      expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073741824, 0.1) < 50
      for: 1m
      labels:
        severity: warning
      annotations:
        # The texts below must state the same threshold as the expression.
        summary: "Free disk space below 50 GB"
        description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
# Warns when the "/" filesystem on a node matching "master.*" has had less
# than 10 GiB available for 1 minute.
- name: diskspace_low_master
  rules:
    - alert: DiskspaceLow
      # Available bytes divided by 1073741824 (2^30, exactly 1 GiB) and
      # rounded to the nearest 0.1.
      expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073741824, 0.1) < 10
      for: 1m
      labels:
        severity: warning
      annotations:
        # The texts below must state the same threshold as the expression.
        summary: "Free disk space below 10 GB"
        description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
2023-04-21 07:59:28 +02:00
# Warns when kube_pod_container_status_waiting_reason has been 1 for a
# series for 5 minutes.
- name: KubernetesUnhealthyPod
  rules:
    - alert: KubernetesUnhealthyPod
      expr: kube_pod_container_status_waiting_reason == 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: 'The Pod {{ $labels.pod }} is {{ $labels.reason }}'
        # NOTE(review): presumably the "node" label here comes from scrape
        # relabeling rather than from the metric itself — confirm it names
        # the node the affected pod runs on.
        description: 'The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}.'
2023-04-20 05:40:46 +02:00
# Critical alert for any series where `up` has been 0 for 3 minutes.
- name: PrometheusTargetMissing
  rules:
    - alert: PrometheusTargetMissing
      expr: up == 0
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "Prometheus target missing (instance {{ $labels.instance }})"
        # Each "{{if ne ... \"\"}}" section is rendered only when the
        # corresponding label is non-empty.
        description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
2023-04-20 05:40:46 +02:00
# Critical alert when prometheus_config_last_reload_successful has been
# different from 1 for 3 minutes.
- name: PrometheusConfigurationReloadFailure
  rules:
    - alert: PrometheusConfigurationReloadFailure
      expr: prometheus_config_last_reload_successful != 1
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
        description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert when alertmanager_config_last_reload_successful has been
# different from 1 for 3 minutes.
- name: PrometheusAlertmanagerConfigurationReloadFailure
  rules:
    - alert: PrometheusAlertmanagerConfigurationReloadFailure
      expr: alertmanager_config_last_reload_successful != 1
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
        description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when more than one distinct alertmanager_config_hash value has been
# observed for 3 minutes.
- name: PrometheusAlertmanagerConfigNotSynced
  rules:
    - alert: PrometheusAlertmanagerConfigNotSynced
      # count() without a "by" clause returns a single series with no
      # labels, so the annotations cannot reference per-instance labels.
      expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: "Prometheus AlertManager config not synced"
        description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Critical alert when prometheus_sd_discovered_targets has been 0 for
# 3 minutes.
- name: PrometheusTargetEmpty
  rules:
    - alert: PrometheusTargetEmpty
      expr: prometheus_sd_discovered_targets == 0
      for: 3m
      labels:
        severity: critical
      annotations:
        summary: "Prometheus target empty (instance {{ $labels.instance }})"
        description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the 0.9 quantile of prometheus_target_interval_length_seconds
# has been above 120 for 5 minutes.
- name: PrometheusTargetScrapingSlow
  rules:
    - alert: PrometheusTargetScrapingSlow
      expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
        description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when prometheus_target_scrapes_exceeded_sample_limit_total grew by
# more than 10 within 10 minutes, sustained for 5 minutes.
- name: PrometheusLargeScrape
  rules:
    - alert: PrometheusLargeScrape
      expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Prometheus large scrape (instance {{ $labels.instance }})"
        description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the per-second rate of node_vmstat_pgmajfault has been above
# 1000 for 2 minutes.
- name: HostMemoryUnderMemoryPressure
  rules:
    - alert: HostMemoryUnderMemoryPressure
      # rate() needs at least two samples inside the range. This file's
      # global scrape_interval is 1m, so a 1m range usually holds a single
      # sample and yields no result; 5m spans several scrapes. The result
      # is still a per-second rate, so the threshold is unchanged.
      expr: rate(node_vmstat_pgmajfault[5m]) > 1000
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "Host memory under memory pressure {{ $labels.node }}"
        description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the summed disk read rate, converted from bytes/s by two
# divisions by 1024, has been above 200 for 5 minutes.
- name: HostUnusualDiskReadRate
  rules:
    - alert: HostUnusualDiskReadRate
      # "node" is kept in the grouping because the summary references
      # $labels.node; grouping by instance alone would drop that label.
      expr: sum by (instance, node) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual disk read rate {{ $labels.node }}"
        description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the summed disk write rate, converted from bytes/s by two
# divisions by 1024, has been above 200 for 3 minutes.
- name: HostUnusualDiskWriteRate
  rules:
    - alert: HostUnusualDiskWriteRate
      # "node" is kept in the grouping because the summary references
      # $labels.node; grouping by instance alone would drop that label.
      expr: sum by (instance, node) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
      for: 3m
      labels:
        severity: warning
      annotations:
        summary: "Host unusual disk write rate {{ $labels.node }}"
        description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when the average rate of node_cpu_seconds_total in mode "steal",
# multiplied by 100, has been above 10 for 1 minute.
- name: HostCpuStealNoisyNeighbor
  rules:
    - alert: HostCpuStealNoisyNeighbor
      # "node" is kept in the grouping because the summary references
      # $labels.node; grouping by instance alone would drop that label.
      expr: avg by (instance, node) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Host CPU steal noisy neighbor {{ $labels.node }}"
        description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when node_hwmon_temp_celsius has been above 85 for 5 minutes.
- name: HostPhysicalComponentTooHot
  rules:
    - alert: HostPhysicalComponentTooHot
      expr: node_hwmon_temp_celsius > 85
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Host physical component too hot {{ $labels.node }}"
        description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
# Critical alert as soon as smartmon_device_smart_healthy drops below 1
# ("for: 0m" means there is no waiting period).
- name: SMARTbad
  rules:
    - alert: SMARTbad
      expr: smartmon_device_smart_healthy < 1
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}"
        description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when smartmon_smartctl_run is more than 10800 seconds (3 hours)
# behind the current time. No "for" clause is set.
- name: SMARTcheck_old
  rules:
    - alert: "SMARTcheck too old"
      expr: (time() - smartmon_smartctl_run) > 10800
      labels:
        severity: warning
      annotations:
        summary: "SMARTcheck not running"
        description: "The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix."
2023-04-06 18:51:47 +02:00
# Kept as a quoted string, not a YAML boolean.
allow-snippet-annotations: "false"
prometheus.yml: |
  # Defaults for scraping and rule evaluation.
  global:
    scrape_interval: 1m
    scrape_timeout: 10s
    evaluation_interval: 1m
  # Rule files to load; the names match the other data keys of this ConfigMap.
  rule_files:
    - /etc/config/recording_rules.yml
    - /etc/config/alerting_rules.yml
    - /etc/config/rules
    - /etc/config/alerts
scrape_configs:
  # Static target on localhost port 9090.
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']
# Scrapes endpoints over HTTPS with the service-account token, keeping only
# the "https" port of the "kubernetes" service in the "default" namespace.
- job_name: kubernetes-apiservers
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # NOTE(review): certificate verification is skipped even though a CA
    # file is configured — confirm this is intentional.
    insecure_skip_verify: true
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    # The three source labels are joined with ";" before matching.
    - source_labels:
        - __meta_kubernetes_namespace
        - __meta_kubernetes_service_name
        - __meta_kubernetes_endpoint_port_name
      regex: default;kubernetes;https
      action: keep
# Discovers nodes and scrapes each one through
# kubernetes.default.svc:443 at /api/v1/nodes/<node name>/proxy/metrics.
- job_name: kubernetes-nodes
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # NOTE(review): certificate verification is skipped even though a CA
    # file is configured — confirm this is intentional.
    insecure_skip_verify: true
  kubernetes_sd_configs:
    - role: node
  relabel_configs:
    # Copy node labels onto the target.
    - regex: __meta_kubernetes_node_label_(.+)
      action: labelmap
    # Send every scrape to the fixed in-cluster address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Build the per-node metrics path from the node name.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/$1/proxy/metrics
# Discovers nodes and scrapes each one through kubernetes.default.svc:443
# at /api/v1/nodes/<node name>/proxy/metrics/cadvisor.
- job_name: kubernetes-nodes-cadvisor
  scheme: https
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    # NOTE(review): certificate verification is skipped even though a CA
    # file is configured — confirm this is intentional.
    insecure_skip_verify: true
  kubernetes_sd_configs:
    - role: node
  relabel_configs:
    # Copy node labels onto the target.
    - regex: __meta_kubernetes_node_label_(.+)
      action: labelmap
    # Send every scrape to the fixed in-cluster address.
    - target_label: __address__
      replacement: kubernetes.default.svc:443
    # Build the per-node cadvisor path from the node name.
    - source_labels: [__meta_kubernetes_node_name]
      regex: (.+)
      target_label: __metrics_path__
      replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
# Scrapes service endpoints whose service carries the prometheus.io/scrape
# annotation set to "true", excluding those marked scrape_slow.
- job_name: kubernetes-service-endpoints
  honor_labels: true
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    # Keep targets annotated scrape=true.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
      regex: true
      action: keep
    # Drop targets annotated scrape_slow=true.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
      regex: true
      action: drop
    # Optional scheme override (http or https) from the annotation.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
      regex: (https?)
      target_label: __scheme__
      action: replace
    # Optional metrics path override from the annotation.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
      action: replace
    # Replace the port in the address with the annotated port.
    - source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
      regex: (.+?)(?::\d+)?;(\d+)
      target_label: __address__
      replacement: $1:$2
      action: replace
    # Map prometheus.io/param_* annotations to __param_* labels.
    - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
      replacement: __param_$1
      action: labelmap
    # Copy service labels onto the target.
    - regex: __meta_kubernetes_service_label_(.+)
      action: labelmap
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
      action: replace
    - source_labels: [__meta_kubernetes_service_name]
      target_label: service
      action: replace
    - source_labels: [__meta_kubernetes_pod_node_name]
      target_label: node
      action: replace
# Scrapes service endpoints whose service carries the
# prometheus.io/scrape_slow annotation set to "true", every 5 minutes.
- job_name: kubernetes-service-endpoints-slow
  honor_labels: true
  scrape_interval: 5m
  scrape_timeout: 30s
  kubernetes_sd_configs:
    - role: endpoints
  relabel_configs:
    # Keep targets annotated scrape_slow=true.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape_slow]
      regex: true
      action: keep
    # Optional scheme override (http or https) from the annotation.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
      regex: (https?)
      target_label: __scheme__
      action: replace
    # Optional metrics path override from the annotation.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
      action: replace
    # Replace the port in the address with the annotated port.
    - source_labels:
        - __address__
        - __meta_kubernetes_service_annotation_prometheus_io_port
      regex: (.+?)(?::\d+)?;(\d+)
      target_label: __address__
      replacement: $1:$2
      action: replace
    # Map prometheus.io/param_* annotations to __param_* labels.
    - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
      replacement: __param_$1
      action: labelmap
    # Copy service labels onto the target.
    - regex: __meta_kubernetes_service_label_(.+)
      action: labelmap
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
      action: replace
    - source_labels: [__meta_kubernetes_service_name]
      target_label: service
      action: replace
    - source_labels: [__meta_kubernetes_pod_node_name]
      target_label: node
      action: replace
# Scrapes services whose prometheus.io/probe annotation equals
# "pushgateway".
- job_name: prometheus-pushgateway
  honor_labels: true
  kubernetes_sd_configs:
    - role: service
  relabel_configs:
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
      regex: pushgateway
      action: keep
# Probes services annotated prometheus.io/probe=true: the service address
# becomes the "target" parameter and the request goes to "blackbox" at
# /probe with module http_2xx.
- job_name: kubernetes-services
  honor_labels: true
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
    - role: service
  relabel_configs:
    # Keep services annotated probe=true.
    - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
      regex: true
      action: keep
    # Pass the discovered address as the probe target.
    - source_labels: [__address__]
      target_label: __param_target
    # Send the scrape itself to the "blackbox" address.
    - target_label: __address__
      replacement: blackbox
    # Report the probed target as the instance.
    - source_labels: [__param_target]
      target_label: instance
    # Copy service labels onto the target.
    - regex: __meta_kubernetes_service_label_(.+)
      action: labelmap
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
    - source_labels: [__meta_kubernetes_service_name]
      target_label: service
# Scrapes pods annotated prometheus.io/scrape=true, excluding those marked
# scrape_slow and those in phase Pending, Succeeded, Failed or Completed.
- job_name: kubernetes-pods
  honor_labels: true
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Keep pods annotated scrape=true.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
      regex: true
      action: keep
    # Drop pods annotated scrape_slow=true.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
      regex: true
      action: drop
    # Optional scheme override (http or https) from the annotation.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
      regex: (https?)
      target_label: __scheme__
      action: replace
    # Optional metrics path override from the annotation.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
      action: replace
    # Hex/colon (IPv6-style) pod IP: build "[ip]:port" from the annotation.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        - __meta_kubernetes_pod_ip
      regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
      target_label: __address__
      replacement: '[$2]:$1'
      action: replace
    # Dotted-decimal (IPv4-style) pod IP: build "ip:port" from the annotation.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        - __meta_kubernetes_pod_ip
      regex: (\d+);((([0-9]+?)(\.|$)){4})
      target_label: __address__
      replacement: $2:$1
      action: replace
    # Map prometheus.io/param_* annotations to __param_* labels.
    - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
      replacement: __param_$1
      action: labelmap
    # Copy pod labels onto the target.
    - regex: __meta_kubernetes_pod_label_(.+)
      action: labelmap
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
      action: replace
    - source_labels: [__meta_kubernetes_pod_name]
      target_label: pod
      action: replace
    # Drop pods whose phase matches one of the listed values.
    - source_labels: [__meta_kubernetes_pod_phase]
      regex: Pending|Succeeded|Failed|Completed
      action: drop
# Scrapes pods annotated prometheus.io/scrape_slow=true every 5 minutes,
# excluding pods in phase Pending, Succeeded, Failed or Completed.
- job_name: kubernetes-pods-slow
  honor_labels: true
  scrape_interval: 5m
  scrape_timeout: 30s
  kubernetes_sd_configs:
    - role: pod
  relabel_configs:
    # Keep pods annotated scrape_slow=true.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape_slow]
      regex: true
      action: keep
    # Optional scheme override (http or https) from the annotation.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scheme]
      regex: (https?)
      target_label: __scheme__
      action: replace
    # Optional metrics path override from the annotation.
    - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
      regex: (.+)
      target_label: __metrics_path__
      action: replace
    # Hex/colon (IPv6-style) pod IP: build "[ip]:port" from the annotation.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        - __meta_kubernetes_pod_ip
      regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
      target_label: __address__
      replacement: '[$2]:$1'
      action: replace
    # Dotted-decimal (IPv4-style) pod IP: build "ip:port" from the annotation.
    - source_labels:
        - __meta_kubernetes_pod_annotation_prometheus_io_port
        - __meta_kubernetes_pod_ip
      regex: (\d+);((([0-9]+?)(\.|$)){4})
      target_label: __address__
      replacement: $2:$1
      action: replace
    # Map prometheus.io/param_* annotations to __param_* labels.
    - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
      replacement: __param_$1
      action: labelmap
    # Copy pod labels onto the target.
    - regex: __meta_kubernetes_pod_label_(.+)
      action: labelmap
    - source_labels: [__meta_kubernetes_namespace]
      target_label: namespace
      action: replace
    - source_labels: [__meta_kubernetes_pod_name]
      target_label: pod
      action: replace
    # Drop pods whose phase matches one of the listed values.
    - source_labels: [__meta_kubernetes_pod_phase]
      regex: Pending|Succeeded|Failed|Completed
      action: drop
# Alertmanager discovery: pods in namespace "prometheus" labelled
# app.kubernetes.io/instance=prometheus and
# app.kubernetes.io/name=alertmanager, on container port 9093.
alerting:
  alertmanagers:
    - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      kubernetes_sd_configs:
        - role: pod
      relabel_configs:
        - action: keep
          regex: prometheus
          source_labels:
            - __meta_kubernetes_namespace
        - action: keep
          regex: prometheus
          source_labels:
            - __meta_kubernetes_pod_label_app_kubernetes_io_instance
        - action: keep
          regex: alertmanager
          source_labels:
            - __meta_kubernetes_pod_label_app_kubernetes_io_name
        # Quoted so the port is matched as the string "9093".
        - action: keep
          regex: "9093"
          source_labels:
            - __meta_kubernetes_pod_container_port_number
# Both entries hold an empty mapping; no rules are defined in them.
recording_rules.yml: |
  {}
rules: |
  {}