core-deployments/prometheus/config-map.yaml

513 lines
18 KiB
YAML
Raw Normal View History

---
kind: ConfigMap
metadata:
labels:
app: prometheus
app.kubernetes.io/instance: prometheus
component: server
release: prometheus
name: prometheus-server
namespace: prometheus
apiVersion: v1
data:
alerting_rules.yml: |
{}
alerts: |
groups:
- name: memory_high
rules:
- alert: MemoryHigh
expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
for: 5m
labels:
severity: warning
annotations:
summary: "Memory over 80%"
2023-04-06 19:46:19 +02:00
description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
2023-04-20 05:40:46 +02:00
- name: diskspace_low_worker
rules:
- alert: DiskspaceLow
2023-04-20 05:54:18 +02:00
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
2023-04-20 05:40:46 +02:00
for: 1m
labels:
severity: warning
annotations:
summary: "Free disk space below 10 GB"
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- name: diskspace_low_master
rules:
- alert: DiskspaceLow
2023-04-20 05:54:18 +02:00
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
2023-04-20 05:40:46 +02:00
for: 1m
labels:
severity: warning
annotations:
summary: "Free disk space below 2 GB"
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- name: PrometheusTargetMissing
rules:
- alert: PrometheusTargetMissing
expr: up == 0
for: 3m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusConfigurationReloadFailure
rules:
- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 3m
labels:
severity: critical
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusAlertmanagerConfigurationReloadFailure
rules:
- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
for: 3m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusAlertmanagerConfigNotSynced
rules:
- alert: PrometheusAlertmanagerConfigNotSynced
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 3m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusTargetEmpty
rules:
- alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 3m
labels:
severity: critical
annotations:
summary: Prometheus target empty (instance {{ $labels.instance }})
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusTargetScrapingSlow
rules:
- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: PrometheusLargeScrape
rules:
- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus large scrape (instance {{ $labels.instance }})
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: HostMemoryUnderMemoryPressure
rules:
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure {{ $labels.node }}
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: HostUnusualDiskReadRate
rules:
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
for: 5m
labels:
severity: warning
annotations:
summary: Host unusual disk read rate {{ $labels.node }}
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: HostUnusualDiskWriteRate
rules:
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
for: 3m
labels:
severity: warning
annotations:
summary: Host unusual disk write rate {{ $labels.node }}
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: HostCpuStealNoisyNeighbor
rules:
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 1m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor {{ $labels.node }}
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: HostPhysicalComponentTooHot
rules:
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 85
for: 5m
labels:
severity: warning
annotations:
summary: Host physical component too hot {{ $labels.node }}
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
- name: SMARTbad
rules:
- alert: SMARTbad
expr: smartmon_device_smart_healthy < 1
for: 0m
labels:
severity: critical
annotations:
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- name: SMARTcheck_old
rules:
- alert: "SMARTcheck too old"
expr: (time() - smartmon_smartctl_run) > 10800
labels:
severity: warning
annotations:
summary: "SMARTcheck not running"
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
allow-snippet-annotations: 'false'
prometheus.yml: |
global:
evaluation_interval: 1m
scrape_interval: 1m
scrape_timeout: 10s
rule_files:
- /etc/config/recording_rules.yml
- /etc/config/alerting_rules.yml
- /etc/config/rules
- /etc/config/alerts
scrape_configs:
- job_name: prometheus
static_configs:
- targets:
- localhost:9090
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-apiservers
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: default;kubernetes;https
source_labels:
- __meta_kubernetes_namespace
- __meta_kubernetes_service_name
- __meta_kubernetes_endpoint_port_name
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
job_name: kubernetes-nodes-cadvisor
kubernetes_sd_configs:
- role: node
relabel_configs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- replacement: kubernetes.default.svc:443
target_label: __address__
- regex: (.+)
replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
source_labels:
- __meta_kubernetes_node_name
target_label: __metrics_path__
scheme: https
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecure_skip_verify: true
- honor_labels: true
job_name: kubernetes-service-endpoints
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
- honor_labels: true
job_name: kubernetes-service-endpoints-slow
kubernetes_sd_configs:
- role: endpoints
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (.+?)(?::\d+)?;(\d+)
replacement: $1:$2
source_labels:
- __address__
- __meta_kubernetes_service_annotation_prometheus_io_port
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_service_name
target_label: service
- action: replace
source_labels:
- __meta_kubernetes_pod_node_name
target_label: node
scrape_interval: 5m
scrape_timeout: 30s
- honor_labels: true
job_name: prometheus-pushgateway
kubernetes_sd_configs:
- role: service
relabel_configs:
- action: keep
regex: pushgateway
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- honor_labels: true
job_name: kubernetes-services
kubernetes_sd_configs:
- role: service
metrics_path: /probe
params:
module:
- http_2xx
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_service_annotation_prometheus_io_probe
- source_labels:
- __address__
target_label: __param_target
- replacement: blackbox
target_label: __address__
- source_labels:
- __param_target
target_label: instance
- action: labelmap
regex: __meta_kubernetes_service_label_(.+)
- source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- source_labels:
- __meta_kubernetes_service_name
target_label: service
- honor_labels: true
job_name: kubernetes-pods
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape
- action: drop
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
- honor_labels: true
job_name: kubernetes-pods-slow
kubernetes_sd_configs:
- role: pod
relabel_configs:
- action: keep
regex: true
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- action: replace
regex: (https?)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_scheme
target_label: __scheme__
- action: replace
regex: (.+)
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_path
target_label: __metrics_path__
- action: replace
regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
replacement: '[$2]:$1'
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: replace
regex: (\d+);((([0-9]+?)(\.|$)){4})
replacement: $2:$1
source_labels:
- __meta_kubernetes_pod_annotation_prometheus_io_port
- __meta_kubernetes_pod_ip
target_label: __address__
- action: labelmap
regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
replacement: __param_$1
- action: labelmap
regex: __meta_kubernetes_pod_label_(.+)
- action: replace
source_labels:
- __meta_kubernetes_namespace
target_label: namespace
- action: replace
source_labels:
- __meta_kubernetes_pod_name
target_label: pod
- action: drop
regex: Pending|Succeeded|Failed|Completed
source_labels:
- __meta_kubernetes_pod_phase
scrape_interval: 5m
scrape_timeout: 30s
alerting:
alertmanagers:
- kubernetes_sd_configs:
- role: pod
tls_config:
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
relabel_configs:
- source_labels: [__meta_kubernetes_namespace]
regex: prometheus
action: keep
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
regex: prometheus
action: keep
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
regex: alertmanager
action: keep
- source_labels: [__meta_kubernetes_pod_container_port_number]
regex: "9093"
action: keep
recording_rules.yml: |
{}
rules: |
{}