From d5985f50b5fd63a6d484e5ee67fb2756de331a0c Mon Sep 17 00:00:00 2001 From: Aaron Riedel Date: Tue, 20 Jun 2023 08:43:48 +0200 Subject: [PATCH] change prometheus to prometheus-operator with kube-prometheus, this includes grafana --- core-deployments.yaml | 4 +- prometheus/alertmanager-config.yaml | 61 ---- prometheus/alerts.yaml | 187 ++++++++++ prometheus/config-map.yaml | 522 ---------------------------- prometheus/ingress.yaml | 42 --- prometheus/values.yaml | 106 +++++- 6 files changed, 284 insertions(+), 638 deletions(-) delete mode 100644 prometheus/alertmanager-config.yaml create mode 100644 prometheus/alerts.yaml delete mode 100644 prometheus/config-map.yaml diff --git a/core-deployments.yaml b/core-deployments.yaml index 6aac982..7280199 100644 --- a/core-deployments.yaml +++ b/core-deployments.yaml @@ -90,9 +90,9 @@ metadata: spec: project: default sources: - - chart: prometheus + - chart: kube-prometheus-stack repoURL: https://prometheus-community.github.io/helm-charts - targetRevision: 20.2.0 + targetRevision: 46.8.0 helm: releaseName: prometheus valueFiles: diff --git a/prometheus/alertmanager-config.yaml b/prometheus/alertmanager-config.yaml deleted file mode 100644 index e5ff68a..0000000 --- a/prometheus/alertmanager-config.yaml +++ /dev/null @@ -1,61 +0,0 @@ ---- -kind: ConfigMap -metadata: - labels: - app.kubernetes.io/instance: prometheus - app.kubernetes.io/managed-by: Helm - app.kubernetes.io/name: alertmanager - app.kubernetes.io/version: v0.25.0 - helm.sh/chart: alertmanager-0.24.1 - name: prometheus-alertmanager - namespace: prometheus -apiVersion: v1 -data: - alertmanager.yml: | - global: - resolve_timeout: 5m - - templates: - - '/etc/alertmanager/telegram.tmpl' - - route: - group_by: ['alertname'] - group_wait: 30s - group_interval: 30s - repeat_interval: 24h - receiver: 'tg1' - routes: - - matchers: - - severity=warning - receiver: 'tg1' - - - matchers: - - severity=critical - receiver: 'tg1' - - receivers: - - name: tg1 - telegram_configs: - - bot_token_file: '/etc/alertmanager/telegram-token/api_key' - chat_id: -995270884 - api_url: "https://api.telegram.org" - send_resolved: true - parse_mode: "HTML" - message: '{{ template "telegram.aaron" .}}' - - inhibit_rules: - - source_matchers: - - severity = critical - target_matchers: - - severity = warning - equal: ['alertname', 'server', 'instance'] - telegram.tmpl: | - {{ define "telegram.aaron" }} - {{ range .Alerts }} - {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }} - {{ .Annotations.summary }} - - {{ .Annotations.description }} - {{ end }} - {{ end }} - diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml new file mode 100644 index 0000000..248223f --- /dev/null +++ b/prometheus/alerts.yaml @@ -0,0 +1,187 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: prometheus-core-deployment-rules + namespace: prometheus +spec: + groups: + - name: memory_high + rules: + - alert: MemoryHigh + expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "Memory over 80%" + description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. 
Memory usage: {{ $value }}%" + - name: diskspace_low_worker + rules: + - alert: DiskspaceLow + expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50 + for: 1m + labels: + severity: warning + annotations: + summary: "Free disk space below 50 GB" + description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" + - name: diskspace_low_master + rules: + - alert: DiskspaceLow + expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10 + for: 1m + labels: + severity: warning + annotations: + summary: "Free disk space below 10 GB" + description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" + - name: KubernetesUnhealthyPod + rules: + - alert: KubernetesUnhealthyPod + expr: kube_pod_container_status_waiting_reason == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}" + description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}." + - name: PrometheusTargetMissing + rules: + - alert: PrometheusTargetMissing + expr: up == 0 + for: 3m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}" + - name: PrometheusConfigurationReloadFailure + rules: + - alert: PrometheusConfigurationReloadFailure + expr: prometheus_config_last_reload_successful != 1 + for: 3m + labels: + severity: critical + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: PrometheusAlertmanagerConfigurationReloadFailure + rules: + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: alertmanager_config_last_reload_successful != 1 + for: 3m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: PrometheusAlertmanagerConfigNotSynced + rules: + - alert: PrometheusAlertmanagerConfigNotSynced + expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 + for: 3m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: PrometheusTargetEmpty + rules: + - alert: PrometheusTargetEmpty + expr: prometheus_sd_discovered_targets == 0 + for: 3m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: PrometheusTargetScrapingSlow + rules: + - alert: PrometheusTargetScrapingSlow
+ expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: PrometheusLargeScrape + rules: + - alert: PrometheusLargeScrape + expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: HostMemoryUnderMemoryPressure + rules: + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 2m + labels: + severity: warning + annotations: + summary: Host memory under memory pressure {{ $labels.node }} + description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: HostUnusualDiskReadRate + rules: + - alert: HostUnusualDiskReadRate + expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200 + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk read rate {{ $labels.node }} + description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: HostUnusualDiskWriteRate + rules: + - alert: HostUnusualDiskWriteRate + expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200 + for: 3m + labels: + severity: warning + annotations: + summary: Host unusual disk write rate {{ $labels.node }} + description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: HostCpuStealNoisyNeighbor + rules: + - alert: HostCpuStealNoisyNeighbor + expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 + for: 1m + labels: + severity: warning + annotations: + summary: Host CPU steal noisy neighbor {{ $labels.node }} + description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: HostPhysicalComponentTooHot + rules: + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 85 + for: 5m + labels: + severity: warning + annotations: + summary: Host physical component too hot {{ $labels.node }} + description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}" + - name: SMARTbad + rules: + - alert: SMARTbad + expr: smartmon_device_smart_healthy < 1 + for: 0m + labels: + severity: critical + annotations: + summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }} + description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - name: SMARTcheck_old + rules: + - alert: "SMARTcheck too old" + expr: (time() - smartmon_smartctl_run) > 10800 + labels: + severity: warning + annotations: + summary: "SMARTcheck not running" + description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
\ No newline at end of file diff --git a/prometheus/config-map.yaml b/prometheus/config-map.yaml deleted file mode 100644 index 0ee0504..0000000 --- a/prometheus/config-map.yaml +++ /dev/null @@ -1,522 +0,0 @@ ---- -kind: ConfigMap -metadata: - labels: - app: prometheus - app.kubernetes.io/instance: prometheus - component: server - release: prometheus - name: prometheus-server - namespace: prometheus -apiVersion: v1 -data: - alerting_rules.yml: | - {} - alerts: | - groups: - - name: memory_high - rules: - - alert: MemoryHigh - expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80 - for: 5m - labels: - severity: warning - annotations: - summary: "Memory over 80%" - description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%" - - name: diskspace_low_worker - rules: - - alert: DiskspaceLow - expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50 - for: 1m - labels: - severity: warning - annotations: - summary: "Free disk space below 10 GB" - description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - - name: diskspace_low_master - rules: - - alert: DiskspaceLow - expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10 - for: 1m - labels: - severity: warning - annotations: - summary: "Free disk space below 2 GB" - description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}" - - name: KubernetesUnhealthyPod - rules: - - alert: KubernetesUnhealthyPod - expr: kube_pod_container_status_waiting_reason == 1 - for: 5m - labels: - severity: warning - annotations: - summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}" - description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}." - - name: PrometheusTargetMissing - rules: - - alert: PrometheusTargetMissing - expr: up == 0 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. 
{{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}" - - name: PrometheusConfigurationReloadFailure - rules: - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusAlertmanagerConfigurationReloadFailure - rules: - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusAlertmanagerConfigNotSynced - rules: - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 3m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusTargetEmpty - rules: - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusTargetScrapingSlow - rules: - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusLargeScrape - rules: - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostMemoryUnderMemoryPressure - rules: - - alert: HostMemoryUnderMemoryPressure - expr: rate(node_vmstat_pgmajfault[1m]) > 1000 - for: 2m - labels: - severity: warning - annotations: - summary: Host memory under memory pressure {{ $labels.node }} - description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostUnusualDiskReadRate - rules: - - alert: HostUnusualDiskReadRate - expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200 - for: 5m - labels: - severity: warning - annotations: - summary: Host unusual disk read rate {{ $labels.node }} - description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostUnusualDiskWriteRate - rules: - - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200 - for: 3m - labels: - severity: warning - annotations: - summary: Host unusual disk write rate {{ $labels.node }} - description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostCpuStealNoisyNeighbor - rules: - - alert: HostCpuStealNoisyNeighbor - expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 - for: 1m - labels: - severity: warning - annotations: - summary: Host CPU steal noisy neighbor {{ $labels.node }} - description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostPhysicalComponentTooHot - rules: - - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 85 - for: 5m - labels: - severity: warning - annotations: - summary: Host physical component too hot {{ $labels.node }} - description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}" - - name: SMARTbad - rules: - - alert: SMARTbad - expr: smartmon_device_smart_healthy < 1 - for: 0m - labels: - severity: critical - annotations: - summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }} - description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: SMARTcheck_old - rules: - - alert: "SMARTcheck too old" - expr: (time() - smartmon_smartctl_run) > 10800 - labels: - severity: warning - annotations: - summary: "SMARTcheck not running" - description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.' 
- allow-snippet-annotations: 'false' - prometheus.yml: | - global: - evaluation_interval: 1m - scrape_interval: 1m - scrape_timeout: 10s - rule_files: - - /etc/config/recording_rules.yml - - /etc/config/alerting_rules.yml - - /etc/config/rules - - /etc/config/alerts - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-apiservers - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: default;kubernetes;https - source_labels: - - __meta_kubernetes_namespace - - __meta_kubernetes_service_name - - __meta_kubernetes_endpoint_port_name - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - honor_labels: true - job_name: kubernetes-service-endpoints - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - - action: drop - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-service-endpoints-slow - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - scrape_interval: 5m - scrape_timeout: 30s - - honor_labels: true - job_name: prometheus-pushgateway - kubernetes_sd_configs: - - role: service - relabel_configs: - - action: keep - regex: pushgateway - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - honor_labels: true - job_name: kubernetes-services - kubernetes_sd_configs: - - role: service - metrics_path: /probe - params: - module: - - http_2xx - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - source_labels: - - __address__ - target_label: __param_target - - replacement: blackbox - target_label: __address__ - - source_labels: - - __param_target - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - source_labels: - - __meta_kubernetes_service_name - target_label: service - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - honor_labels: true - job_name: kubernetes-pods-slow - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - scrape_interval: 5m - scrape_timeout: 30s - alerting: - alertmanagers: - - kubernetes_sd_configs: - - role: pod - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - relabel_configs: - - source_labels: [__meta_kubernetes_namespace] - regex: prometheus - action: keep - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance] - regex: prometheus - action: keep - - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name] - regex: alertmanager - action: keep - - source_labels: [__meta_kubernetes_pod_container_port_number] - regex: "9093" - action: keep - recording_rules.yml: | - {} - rules: | - {} diff --git a/prometheus/ingress.yaml b/prometheus/ingress.yaml index 10f41e4..d09cdff 100644 --- a/prometheus/ingress.yaml +++ b/prometheus/ingress.yaml @@ -1,46 +1,4 @@ --- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: 
prometheus-ingress - namespace: prometheus - annotations: - nginx.org/basic-auth-secret: prometheus-basic-auth-secret -spec: - ingressClassName: nginx - rules: - - host: "prometheus.services.yolokube.de" - http: - paths: - - pathType: Prefix - path: "/" - backend: - service: - name: prometheus-server - port: - number: 80 ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: alertmanager-ingress - namespace: prometheus - annotations: - nginx.org/basic-auth-secret: prometheus-basic-auth-secret -spec: - ingressClassName: nginx - rules: - - host: "alertmanager.services.yolokube.de" - http: - paths: - - pathType: Prefix - path: "/" - backend: - service: - name: prometheus-alertmanager - port: - number: 9093 ---- kind: Secret metadata: name: prometheus-basic-auth-secret diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 74821dc..568e549 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -1,17 +1,101 @@ alertmanager: - image: - repository: aaronriedel/alertmanager - tag: "latest" - extraSecretMounts: - - name: telegram-api - mountPath: /etc/alertmanager/telegram-token - subPath: "" - secretName: telegram-api - readOnly: true - configmapReload: + alertmanagerSpec: + image: + registry: docker.io + repository: aaronriedel/alertmanager + tag: "latest" + replicas: 2 + externalUrl: alertmanager.services.yolokube.de + config: + global: + resolve_timeout: 5m + templates: + - '/etc/alertmanager/config/*.tmpl' + route: + group_by: ['alertname'] + group_wait: 30s + group_interval: 30s + repeat_interval: 24h + receiver: 'tg1' + routes: + - matchers: + - severity=warning + receiver: 'tg1' + - matchers: + - severity=critical + receiver: 'tg1' + receivers: + - name: tg1 + telegram_configs: + - bot_token_file: '/etc/alertmanager/telegram-token/api_key' + chat_id: -995270884 + api_url: "https://api.telegram.org" + send_resolved: true + parse_mode: "HTML" + message: '{{ template "telegram.aaron" .}}' + inhibit_rules: + - source_matchers: + - severity = critical + target_matchers: + - severity = warning + equal: ['alertname', 'server', 'instance'] + templateFiles: + telegram.tmpl: |- + {{ define "telegram.aaron" }} + {{ range .Alerts }} + {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }} + {{ .Annotations.summary }} + + {{ .Annotations.description }} + {{ end }} + {{ end }} + ingress: enabled: true + ingressClassName: nginx + hosts: + - alertmanager.services.yolokube.de + annotations: + nginx.org/basic-auth-secret: prometheus-basic-auth-secret + ingressPerReplica: + enabled: true + ingressClassName: nginx + hostPrefix: alertmanager + hostDomain: services.yolokube.de + annotations: + nginx.org/basic-auth-secret: prometheus-basic-auth-secret + servicePerReplica: + enabled: true +grafana: + defaultDashboardsTimezone: Europe/Berlin + ingress: + enabled: true + hosts: + - grafana.services.yolokube.de + ingressClassName: nginx prometheus-node-exporter: extraArgs: - - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)' + - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)' + - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$' - '--collector.textfile.directory=/host/root/var/log/' - '--collector.ethtool' +prometheus: + servicePerReplica: + enabled: true + ingress: + 
enabled: true + ingressClassName: nginx + hosts: + - prometheus.services.yolokube.de + annotations: + nginx.org/basic-auth-secret: prometheus-basic-auth-secret + ingressPerReplica: + enabled: true + hostPrefix: prometheus + hostDomain: services.yolokube.de + annotations: + nginx.org/basic-auth-secret: prometheus-basic-auth-secret + prometheusSpec: + externalUrl: prometheus.services.yolokube.de + replicas: 2 + servicePerReplica: + enabled: true \ No newline at end of file