change prometheus to prometheus-operator with kube-prometheus, this includes grafana

commit d5985f50b5 (parent 2a75bbe501)
6 changed files with 284 additions and 638 deletions
@@ -90,9 +90,9 @@ metadata:
 spec:
   project: default
   sources:
-    - chart: prometheus
+    - chart: kube-prometheus-stack
       repoURL: https://prometheus-community.github.io/helm-charts
-      targetRevision: 20.2.0
+      targetRevision: 46.8.0
       helm:
         releaseName: prometheus
         valueFiles:
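Note: for orientation, the full multi-source Argo CD Application this hunk patches would look roughly like the sketch below. Only the sources entry is taken from the diff; the metadata, the second source carrying the values file, and the destination are assumptions.

    apiVersion: argoproj.io/v1alpha1
    kind: Application
    metadata:
      name: prometheus                # assumed; not part of the hunk
      namespace: argocd               # assumed Argo CD namespace
    spec:
      project: default
      sources:
        - chart: kube-prometheus-stack
          repoURL: https://prometheus-community.github.io/helm-charts
          targetRevision: 46.8.0
          helm:
            releaseName: prometheus
            valueFiles:
              - $values/prometheus/values.yaml        # hypothetical path into the repo below
        - repoURL: https://example.org/deployments.git # hypothetical git source holding values.yaml
          targetRevision: HEAD
          ref: values
      destination:
        server: https://kubernetes.default.svc         # assumed in-cluster destination
        namespace: prometheus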
@@ -1,61 +0,0 @@
----
-kind: ConfigMap
-metadata:
-  labels:
-    app.kubernetes.io/instance: prometheus
-    app.kubernetes.io/managed-by: Helm
-    app.kubernetes.io/name: alertmanager
-    app.kubernetes.io/version: v0.25.0
-    helm.sh/chart: alertmanager-0.24.1
-  name: prometheus-alertmanager
-  namespace: prometheus
-apiVersion: v1
-data:
-  alertmanager.yml: |
-    global:
-      resolve_timeout: 5m
-
-    templates:
-      - '/etc/alertmanager/telegram.tmpl'
-
-    route:
-      group_by: ['alertname']
-      group_wait: 30s
-      group_interval: 30s
-      repeat_interval: 24h
-      receiver: 'tg1'
-      routes:
-        - matchers:
-            - severity=warning
-          receiver: 'tg1'
-
-        - matchers:
-            - severity=critical
-          receiver: 'tg1'
-
-    receivers:
-      - name: tg1
-        telegram_configs:
-          - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
-            chat_id: -995270884
-            api_url: "https://api.telegram.org"
-            send_resolved: true
-            parse_mode: "HTML"
-            message: '{{ template "telegram.aaron" .}}'
-
-    inhibit_rules:
-      - source_matchers:
-          - severity = critical
-        target_matchers:
-          - severity = warning
-        equal: ['alertname', 'server', 'instance']
-  telegram.tmpl: |
-    {{ define "telegram.aaron" }}
-    {{ range .Alerts }}
-    {{ if eq .Status "firing"}}🔥 <b>{{ .Labels.alertname }}</b> 🔥{{ else }}✅ <b>{{ .Labels.alertname }}</b> ✅{{ end }}
-    <b> {{ .Annotations.summary }} </b>
-
-    {{ .Annotations.description }}
-    {{ end }}
-    {{ end }}
-
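Note: the routing tree, receivers, and Telegram template removed here are not lost; they reappear one-to-one under alertmanager.config and alertmanager.templateFiles in the values.yaml hunk at the bottom of this diff. The diff keeps the custom aaronriedel/alertmanager image and the /etc/alertmanager/telegram-token path for the bot token. As a sketch of an alternative that works with the stock image, the operator can mount the existing telegram-api Secret itself; it lands under /etc/alertmanager/secrets/<name>/ (fragment only, assuming the Secret keeps its name and api_key key):

    alertmanager:
      alertmanagerSpec:
        secrets:
          - telegram-api   # mounted read-only at /etc/alertmanager/secrets/telegram-api/
      config:
        receivers:
          - name: tg1
            telegram_configs:
              - bot_token_file: /etc/alertmanager/secrets/telegram-api/api_key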
prometheus/alerts.yaml (new file, 187 lines)
@@ -0,0 +1,187 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: prometheus-core-deployment-rules
+  namespace: prometheus
+spec:
+  groups:
+    - name: memory_high
+      rules:
+        - alert: MemoryHigh
+          expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Memory over 80%"
+            description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
+    - name: diskspace_low_worker
+      rules:
+        - alert: DiskspaceLow
+          expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Free disk space below 50 GB"
+            description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: diskspace_low_master
+      rules:
+        - alert: DiskspaceLow
+          expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Free disk space below 10 GB"
+            description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+    - name: KubernetesUnhealthyPod
+      rules:
+        - alert: KubernetesUnhealthyPod
+          expr: kube_pod_container_status_waiting_reason == 1
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
+            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
+    - name: PrometheusTargetMissing
+      rules:
+        - alert: PrometheusTargetMissing
+          expr: up == 0
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: Prometheus target missing (instance {{ $labels.instance }})
+            description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
+    - name: PrometheusConfigurationReloadFailure
+      rules:
+        - alert: PrometheusConfigurationReloadFailure
+          expr: prometheus_config_last_reload_successful != 1
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+            description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigurationReloadFailure
+      rules:
+        - alert: PrometheusAlertmanagerConfigurationReloadFailure
+          expr: alertmanager_config_last_reload_successful != 1
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+            description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusAlertmanagerConfigNotSynced
+      rules:
+        - alert: PrometheusAlertmanagerConfigNotSynced
+          expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+          for: 3m
+          labels:
+            severity: warning
+          annotations:
+            summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+            description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusTargetEmpty
+      rules:
+        - alert: PrometheusTargetEmpty
+          expr: prometheus_sd_discovered_targets == 0
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            summary: Prometheus target empty (instance {{ $labels.instance }})
+            description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusTargetScrapingSlow
+      rules:
+        - alert: PrometheusTargetScrapingSlow
+          expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+            description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: PrometheusLargeScrape
+      rules:
+        - alert: PrometheusLargeScrape
+          expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Prometheus large scrape (instance {{ $labels.instance }})
+            description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostMemoryUnderMemoryPressure
+      rules:
+        - alert: HostMemoryUnderMemoryPressure
+          expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host memory under memory pressure {{ $labels.node }}
+            description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostUnusualDiskReadRate
+      rules:
+        - alert: HostUnusualDiskReadRate
+          expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host unusual disk read rate {{ $labels.node }}
+            description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostUnusualDiskWriteRate
+      rules:
+        - alert: HostUnusualDiskWriteRate
+          expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
+          for: 3m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host unusual disk write rate {{ $labels.node }}
+            description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostCpuStealNoisyNeighbor
+      rules:
+        - alert: HostCpuStealNoisyNeighbor
+          expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host CPU steal noisy neighbor {{ $labels.node }}
+            description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: HostPhysicalComponentTooHot
+      rules:
+        - alert: HostPhysicalComponentTooHot
+          expr: node_hwmon_temp_celsius > 85
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host physical component too hot {{ $labels.node }}
+            description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
+    - name: SMARTbad
+      rules:
+        - alert: SMARTbad
+          expr: smartmon_device_smart_healthy < 1
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
+            description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - name: SMARTcheck_old
+      rules:
+        - alert: "SMARTcheck too old"
+          expr: (time() - smartmon_smartctl_run) > 10800
+          labels:
+            severity: warning
+          annotations:
+            summary: "SMARTcheck not running"
+            description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
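Note: the Prometheus Operator only loads PrometheusRule objects matched by the Prometheus resource's ruleSelector. With kube-prometheus-stack defaults, that selector requires the chart's release label, which this file (applied directly, not through the chart) does not carry. Two ways to make it load, as a sketch assuming chart defaults: either add "release: prometheus" to the PrometheusRule's metadata.labels, or relax the selector in values:

    prometheus:
      prometheusSpec:
        # Pick up PrometheusRule objects even if they lack the release label.
        ruleSelectorNilUsesHelmValues: false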
@@ -1,522 +0,0 @@
----
-kind: ConfigMap
-metadata:
-  labels:
-    app: prometheus
-    app.kubernetes.io/instance: prometheus
-    component: server
-    release: prometheus
-  name: prometheus-server
-  namespace: prometheus
-apiVersion: v1
-data:
-  alerting_rules.yml: |
-    {}
-  alerts: |
-    groups:
-      - name: memory_high
-        rules:
-          - alert: MemoryHigh
-            expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: "Memory over 80%"
-              description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
-      - name: diskspace_low_worker
-        rules:
-          - alert: DiskspaceLow
-            expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              summary: "Free disk space below 50 GB"
-              description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
-      - name: diskspace_low_master
-        rules:
-          - alert: DiskspaceLow
-            expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              summary: "Free disk space below 10 GB"
-              description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
-      - name: KubernetesUnhealthyPod
-        rules:
-          - alert: KubernetesUnhealthyPod
-            expr: kube_pod_container_status_waiting_reason == 1
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
-              description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
-      - name: PrometheusTargetMissing
-        rules:
-          - alert: PrometheusTargetMissing
-            expr: up == 0
-            for: 3m
-            labels:
-              severity: critical
-            annotations:
-              summary: Prometheus target missing (instance {{ $labels.instance }})
-              description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
-      - name: PrometheusConfigurationReloadFailure
-        rules:
-          - alert: PrometheusConfigurationReloadFailure
-            expr: prometheus_config_last_reload_successful != 1
-            for: 3m
-            labels:
-              severity: critical
-            annotations:
-              summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
-              description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: PrometheusAlertmanagerConfigurationReloadFailure
-        rules:
-          - alert: PrometheusAlertmanagerConfigurationReloadFailure
-            expr: alertmanager_config_last_reload_successful != 1
-            for: 3m
-            labels:
-              severity: critical
-            annotations:
-              summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
-              description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: PrometheusAlertmanagerConfigNotSynced
-        rules:
-          - alert: PrometheusAlertmanagerConfigNotSynced
-            expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
-            for: 3m
-            labels:
-              severity: warning
-            annotations:
-              summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
-              description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: PrometheusTargetEmpty
-        rules:
-          - alert: PrometheusTargetEmpty
-            expr: prometheus_sd_discovered_targets == 0
-            for: 3m
-            labels:
-              severity: critical
-            annotations:
-              summary: Prometheus target empty (instance {{ $labels.instance }})
-              description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: PrometheusTargetScrapingSlow
-        rules:
-          - alert: PrometheusTargetScrapingSlow
-            expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: Prometheus target scraping slow (instance {{ $labels.instance }})
-              description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: PrometheusLargeScrape
-        rules:
-          - alert: PrometheusLargeScrape
-            expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: Prometheus large scrape (instance {{ $labels.instance }})
-              description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: HostMemoryUnderMemoryPressure
-        rules:
-          - alert: HostMemoryUnderMemoryPressure
-            expr: rate(node_vmstat_pgmajfault[1m]) > 1000
-            for: 2m
-            labels:
-              severity: warning
-            annotations:
-              summary: Host memory under memory pressure {{ $labels.node }}
-              description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: HostUnusualDiskReadRate
-        rules:
-          - alert: HostUnusualDiskReadRate
-            expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: Host unusual disk read rate {{ $labels.node }}
-              description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: HostUnusualDiskWriteRate
-        rules:
-          - alert: HostUnusualDiskWriteRate
-            expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
-            for: 3m
-            labels:
-              severity: warning
-            annotations:
-              summary: Host unusual disk write rate {{ $labels.node }}
-              description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: HostCpuStealNoisyNeighbor
-        rules:
-          - alert: HostCpuStealNoisyNeighbor
-            expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
-            for: 1m
-            labels:
-              severity: warning
-            annotations:
-              summary: Host CPU steal noisy neighbor {{ $labels.node }}
-              description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: HostPhysicalComponentTooHot
-        rules:
-          - alert: HostPhysicalComponentTooHot
-            expr: node_hwmon_temp_celsius > 85
-            for: 5m
-            labels:
-              severity: warning
-            annotations:
-              summary: Host physical component too hot {{ $labels.node }}
-              description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
-      - name: SMARTbad
-        rules:
-          - alert: SMARTbad
-            expr: smartmon_device_smart_healthy < 1
-            for: 0m
-            labels:
-              severity: critical
-            annotations:
-              summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
-              description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-      - name: SMARTcheck_old
-        rules:
-          - alert: "SMARTcheck too old"
-            expr: (time() - smartmon_smartctl_run) > 10800
-            labels:
-              severity: warning
-            annotations:
-              summary: "SMARTcheck not running"
-              description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
-  allow-snippet-annotations: 'false'
-  prometheus.yml: |
-    global:
-      evaluation_interval: 1m
-      scrape_interval: 1m
-      scrape_timeout: 10s
-    rule_files:
-      - /etc/config/recording_rules.yml
-      - /etc/config/alerting_rules.yml
-      - /etc/config/rules
-      - /etc/config/alerts
-    scrape_configs:
-      - job_name: prometheus
-        static_configs:
-          - targets:
-              - localhost:9090
-      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-        job_name: kubernetes-apiservers
-        kubernetes_sd_configs:
-          - role: endpoints
-        relabel_configs:
-          - action: keep
-            regex: default;kubernetes;https
-            source_labels:
-              - __meta_kubernetes_namespace
-              - __meta_kubernetes_service_name
-              - __meta_kubernetes_endpoint_port_name
-        scheme: https
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          insecure_skip_verify: true
-      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-        job_name: kubernetes-nodes
-        kubernetes_sd_configs:
-          - role: node
-        relabel_configs:
-          - action: labelmap
-            regex: __meta_kubernetes_node_label_(.+)
-          - replacement: kubernetes.default.svc:443
-            target_label: __address__
-          - regex: (.+)
-            replacement: /api/v1/nodes/$1/proxy/metrics
-            source_labels:
-              - __meta_kubernetes_node_name
-            target_label: __metrics_path__
-        scheme: https
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          insecure_skip_verify: true
-      - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-        job_name: kubernetes-nodes-cadvisor
-        kubernetes_sd_configs:
-          - role: node
-        relabel_configs:
-          - action: labelmap
-            regex: __meta_kubernetes_node_label_(.+)
-          - replacement: kubernetes.default.svc:443
-            target_label: __address__
-          - regex: (.+)
-            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
-            source_labels:
-              - __meta_kubernetes_node_name
-            target_label: __metrics_path__
-        scheme: https
-        tls_config:
-          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          insecure_skip_verify: true
-      - honor_labels: true
-        job_name: kubernetes-service-endpoints
-        kubernetes_sd_configs:
-          - role: endpoints
-        relabel_configs:
-          - action: keep
-            regex: true
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_scrape
-          - action: drop
-            regex: true
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
-          - action: replace
-            regex: (https?)
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_scheme
-            target_label: __scheme__
-          - action: replace
-            regex: (.+)
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_path
-            target_label: __metrics_path__
-          - action: replace
-            regex: (.+?)(?::\d+)?;(\d+)
-            replacement: $1:$2
-            source_labels:
-              - __address__
-              - __meta_kubernetes_service_annotation_prometheus_io_port
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_namespace
-            target_label: namespace
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_service_name
-            target_label: service
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_pod_node_name
-            target_label: node
-      - honor_labels: true
-        job_name: kubernetes-service-endpoints-slow
-        kubernetes_sd_configs:
-          - role: endpoints
-        relabel_configs:
-          - action: keep
-            regex: true
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
-          - action: replace
-            regex: (https?)
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_scheme
-            target_label: __scheme__
-          - action: replace
-            regex: (.+)
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_path
-            target_label: __metrics_path__
-          - action: replace
-            regex: (.+?)(?::\d+)?;(\d+)
-            replacement: $1:$2
-            source_labels:
-              - __address__
-              - __meta_kubernetes_service_annotation_prometheus_io_port
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_namespace
-            target_label: namespace
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_service_name
-            target_label: service
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_pod_node_name
-            target_label: node
-        scrape_interval: 5m
-        scrape_timeout: 30s
-      - honor_labels: true
-        job_name: prometheus-pushgateway
-        kubernetes_sd_configs:
-          - role: service
-        relabel_configs:
-          - action: keep
-            regex: pushgateway
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_probe
-      - honor_labels: true
-        job_name: kubernetes-services
-        kubernetes_sd_configs:
-          - role: service
-        metrics_path: /probe
-        params:
-          module:
-            - http_2xx
-        relabel_configs:
-          - action: keep
-            regex: true
-            source_labels:
-              - __meta_kubernetes_service_annotation_prometheus_io_probe
-          - source_labels:
-              - __address__
-            target_label: __param_target
-          - replacement: blackbox
-            target_label: __address__
-          - source_labels:
-              - __param_target
-            target_label: instance
-          - action: labelmap
-            regex: __meta_kubernetes_service_label_(.+)
-          - source_labels:
-              - __meta_kubernetes_namespace
-            target_label: namespace
-          - source_labels:
-              - __meta_kubernetes_service_name
-            target_label: service
-      - honor_labels: true
-        job_name: kubernetes-pods
-        kubernetes_sd_configs:
-          - role: pod
-        relabel_configs:
-          - action: keep
-            regex: true
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_scrape
-          - action: drop
-            regex: true
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
-          - action: replace
-            regex: (https?)
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_scheme
-            target_label: __scheme__
-          - action: replace
-            regex: (.+)
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_path
-            target_label: __metrics_path__
-          - action: replace
-            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-            replacement: '[$2]:$1'
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_port
-              - __meta_kubernetes_pod_ip
-            target_label: __address__
-          - action: replace
-            regex: (\d+);((([0-9]+?)(\.|$)){4})
-            replacement: $2:$1
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_port
-              - __meta_kubernetes_pod_ip
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_pod_label_(.+)
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_namespace
-            target_label: namespace
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_pod_name
-            target_label: pod
-          - action: drop
-            regex: Pending|Succeeded|Failed|Completed
-            source_labels:
-              - __meta_kubernetes_pod_phase
-      - honor_labels: true
-        job_name: kubernetes-pods-slow
-        kubernetes_sd_configs:
-          - role: pod
-        relabel_configs:
-          - action: keep
-            regex: true
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
-          - action: replace
-            regex: (https?)
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_scheme
-            target_label: __scheme__
-          - action: replace
-            regex: (.+)
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_path
-            target_label: __metrics_path__
-          - action: replace
-            regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
-            replacement: '[$2]:$1'
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_port
-              - __meta_kubernetes_pod_ip
-            target_label: __address__
-          - action: replace
-            regex: (\d+);((([0-9]+?)(\.|$)){4})
-            replacement: $2:$1
-            source_labels:
-              - __meta_kubernetes_pod_annotation_prometheus_io_port
-              - __meta_kubernetes_pod_ip
-            target_label: __address__
-          - action: labelmap
-            regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
-            replacement: __param_$1
-          - action: labelmap
-            regex: __meta_kubernetes_pod_label_(.+)
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_namespace
-            target_label: namespace
-          - action: replace
-            source_labels:
-              - __meta_kubernetes_pod_name
-            target_label: pod
-          - action: drop
-            regex: Pending|Succeeded|Failed|Completed
-            source_labels:
-              - __meta_kubernetes_pod_phase
-        scrape_interval: 5m
-        scrape_timeout: 30s
-    alerting:
-      alertmanagers:
-        - kubernetes_sd_configs:
-            - role: pod
-          tls_config:
-            ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
-          bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
-          relabel_configs:
-            - source_labels: [__meta_kubernetes_namespace]
-              regex: prometheus
-              action: keep
-            - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
-              regex: prometheus
-              action: keep
-            - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
-              regex: alertmanager
-              action: keep
-            - source_labels: [__meta_kubernetes_pod_container_port_number]
-              regex: "9093"
-              action: keep
-  recording_rules.yml: |
-    {}
-  rules: |
-    {}
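Note: these hand-maintained scrape_configs do not need to be ported verbatim; kube-prometheus-stack ships the operator plus default ServiceMonitors for the API server, kubelet/nodes, cadvisor, kube-state-metrics and node-exporter out of the box. For anything custom, the operator pattern is one ServiceMonitor per target. A minimal sketch with hypothetical names:

    apiVersion: monitoring.coreos.com/v1
    kind: ServiceMonitor
    metadata:
      name: example-app               # hypothetical
      namespace: prometheus
      labels:
        release: prometheus           # matched by the chart's default serviceMonitorSelector
    spec:
      namespaceSelector:
        matchNames:
          - default                   # hypothetical namespace of the target Service
      selector:
        matchLabels:
          app: example-app            # hypothetical Service label
      endpoints:
        - port: metrics               # named port on the Service
          interval: 1m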
@@ -1,46 +1,4 @@
 ---
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: prometheus-ingress
-  namespace: prometheus
-  annotations:
-    nginx.org/basic-auth-secret: prometheus-basic-auth-secret
-spec:
-  ingressClassName: nginx
-  rules:
-    - host: "prometheus.services.yolokube.de"
-      http:
-        paths:
-          - pathType: Prefix
-            path: "/"
-            backend:
-              service:
-                name: prometheus-server
-                port:
-                  number: 80
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
-  name: alertmanager-ingress
-  namespace: prometheus
-  annotations:
-    nginx.org/basic-auth-secret: prometheus-basic-auth-secret
-spec:
-  ingressClassName: nginx
-  rules:
-    - host: "alertmanager.services.yolokube.de"
-      http:
-        paths:
-          - pathType: Prefix
-            path: "/"
-            backend:
-              service:
-                name: prometheus-alertmanager
-                port:
-                  number: 9093
----
 kind: Secret
 metadata:
   name: prometheus-basic-auth-secret
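Note: both Ingress objects move into chart values (see the values.yaml hunk below); only the basic-auth Secret stays in this file, and the hunk shows just its head, since the rest lies outside the diff context. For the nginx.org/basic-auth-secret annotation of the NGINX Inc. controller, the full object would look roughly like this sketch (the data value is a placeholder):

    ---
    apiVersion: v1
    kind: Secret
    metadata:
      name: prometheus-basic-auth-secret
      namespace: prometheus
    type: nginx.org/htpasswd
    data:
      htpasswd: <base64-encoded htpasswd entries>   # placeholder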
@@ -1,17 +1,101 @@
 alertmanager:
-  image:
-    repository: aaronriedel/alertmanager
-    tag: "latest"
-  extraSecretMounts:
-    - name: telegram-api
-      mountPath: /etc/alertmanager/telegram-token
-      subPath: ""
-      secretName: telegram-api
-      readOnly: true
-  configmapReload:
+  alertmanagerSpec:
+    image:
+      registry: docker.io
+      repository: aaronriedel/alertmanager
+      tag: "latest"
+    replicas: 2
+    externalUrl: alertmanager.services.yolokube.de
+  config:
+    global:
+      resolve_timeout: 5m
+    templates:
+      - '/etc/alertmanager/config/*.tmpl'
+    route:
+      group_by: ['alertname']
+      group_wait: 30s
+      group_interval: 30s
+      repeat_interval: 24h
+      receiver: 'tg1'
+      routes:
+        - matchers:
+            - severity=warning
+          receiver: 'tg1'
+        - matchers:
+            - severity=critical
+          receiver: 'tg1'
+    receivers:
+      - name: tg1
+        telegram_configs:
+          - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
+            chat_id: -995270884
+            api_url: "https://api.telegram.org"
+            send_resolved: true
+            parse_mode: "HTML"
+            message: '{{ template "telegram.aaron" .}}'
+    inhibit_rules:
+      - source_matchers:
+          - severity = critical
+        target_matchers:
+          - severity = warning
+        equal: ['alertname', 'server', 'instance']
+  templateFiles:
+    telegram.tmpl: |-
+      {{ define "telegram.aaron" }}
+      {{ range .Alerts }}
+      {{ if eq .Status "firing"}}🔥 <b>{{ .Labels.alertname }}</b> 🔥{{ else }}✅ <b>{{ .Labels.alertname }}</b> ✅{{ end }}
+      <b> {{ .Annotations.summary }} </b>
+
+      {{ .Annotations.description }}
+      {{ end }}
+      {{ end }}
+  ingress:
     enabled: true
+    ingressClassName: nginx
+    hosts:
+      - alertmanager.services.yolokube.de
+    annotations:
+      nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+  ingressPerReplica:
+    enabled: true
+    ingressClassName: nginx
+    hostPrefix: alertmanager
+    hostDomain: services.yolokube.de
+    annotations:
+      nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+  servicePerReplica:
+    enabled: true
+grafana:
+  defaultDashboardsTimezone: Europe/Berlin
+  ingress:
+    enabled: true
+    hosts:
+      - grafana.services.yolokube.de
+    ingressClassName: nginx
 prometheus-node-exporter:
   extraArgs:
-    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+    - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)'
+    - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
     - '--collector.textfile.directory=/host/root/var/log/'
     - '--collector.ethtool'
+prometheus:
+  servicePerReplica:
+    enabled: true
+  ingress:
+    enabled: true
+    ingressClassName: nginx
+    hosts:
+      - prometheus.services.yolokube.de
+    annotations:
+      nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+  ingressPerReplica:
+    enabled: true
+    hostPrefix: prometheus
+    hostDomain: services.yolokube.de
+    annotations:
+      nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+  prometheusSpec:
+    externalUrl: prometheus.services.yolokube.de
+    replicas: 2
+    servicePerReplica:
+      enabled: true
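Note: the deleted prometheus.yml also carried a blackbox-style probe job (kubernetes-services) and a pushgateway job, and nothing in this values hunk recreates them. If they are still wanted, raw scrape configs can be carried over through prometheus.prometheusSpec.additionalScrapeConfigs; a sketch, copying the probe job from the deleted file (the blackbox Service name is assumed to exist as before):

    prometheus:
      prometheusSpec:
        additionalScrapeConfigs:
          - job_name: kubernetes-services        # carried over from the deleted prometheus.yml
            metrics_path: /probe
            params:
              module: [http_2xx]
            kubernetes_sd_configs:
              - role: service
            relabel_configs:
              - action: keep
                regex: true
                source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
              - source_labels: [__address__]
                target_label: __param_target
              - replacement: blackbox            # assumes the blackbox-exporter Service name
                target_label: __address__
              - source_labels: [__param_target]
                target_label: instance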