diff --git a/core-deployments.yaml b/core-deployments.yaml
index 6aac982..7280199 100644
--- a/core-deployments.yaml
+++ b/core-deployments.yaml
@@ -90,9 +90,9 @@ metadata:
spec:
project: default
sources:
- - chart: prometheus
+ - chart: kube-prometheus-stack
repoURL: https://prometheus-community.github.io/helm-charts
- targetRevision: 20.2.0
+ targetRevision: 46.8.0
helm:
releaseName: prometheus
valueFiles:
diff --git a/prometheus/alertmanager-config.yaml b/prometheus/alertmanager-config.yaml
deleted file mode 100644
index e5ff68a..0000000
--- a/prometheus/alertmanager-config.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
----
-kind: ConfigMap
-metadata:
- labels:
- app.kubernetes.io/instance: prometheus
- app.kubernetes.io/managed-by: Helm
- app.kubernetes.io/name: alertmanager
- app.kubernetes.io/version: v0.25.0
- helm.sh/chart: alertmanager-0.24.1
- name: prometheus-alertmanager
- namespace: prometheus
-apiVersion: v1
-data:
- alertmanager.yml: |
- global:
- resolve_timeout: 5m
-
- templates:
- - '/etc/alertmanager/telegram.tmpl'
-
- route:
- group_by: ['alertname']
- group_wait: 30s
- group_interval: 30s
- repeat_interval: 24h
- receiver: 'tg1'
- routes:
- - matchers:
- - severity=warning
- receiver: 'tg1'
-
- - matchers:
- - severity=critical
- receiver: 'tg1'
-
- receivers:
- - name: tg1
- telegram_configs:
- - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
- chat_id: -995270884
- api_url: "https://api.telegram.org"
- send_resolved: true
- parse_mode: "HTML"
- message: '{{ template "telegram.aaron" .}}'
-
- inhibit_rules:
- - source_matchers:
- - severity = critical
- target_matchers:
- - severity = warning
- equal: ['alertname', 'server', 'instance']
- telegram.tmpl: |
- {{ define "telegram.aaron" }}
- {{ range .Alerts }}
- {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }}
- {{ .Annotations.summary }}
-
- {{ .Annotations.description }}
- {{ end }}
- {{ end }}
-
diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
new file mode 100644
index 0000000..248223f
--- /dev/null
+++ b/prometheus/alerts.yaml
@@ -0,0 +1,187 @@
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+ name: prometheus-core-deployment-rules
+ namespace: prometheus
+spec:
+ groups:
+ - name: memory_high
+ rules:
+ - alert: MemoryHigh
+ expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Memory over 80%"
+ description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
+ - name: diskspace_low_worker
+ rules:
+ - alert: DiskspaceLow
+ expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Free disk space below 10 GB"
+ description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+ - name: diskspace_low_master
+ rules:
+ - alert: DiskspaceLow
+ expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Free disk space below 2 GB"
+ description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
+ - name: KubernetesUnhealthyPod
+ rules:
+ - alert: KubernetesUnhealthyPod
+ expr: kube_pod_container_status_waiting_reason == 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
+ description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
+ - name: PrometheusTargetMissing
+ rules:
+ - alert: PrometheusTargetMissing
+ expr: up == 0
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus target missing (instance {{ $labels.instance }})
+ description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
+ - name: PrometheusConfigurationReloadFailure
+ rules:
+ - alert: PrometheusConfigurationReloadFailure
+ expr: prometheus_config_last_reload_successful != 1
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+ description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: PrometheusAlertmanagerConfigurationReloadFailure
+ rules:
+ - alert: PrometheusAlertmanagerConfigurationReloadFailure
+ expr: alertmanager_config_last_reload_successful != 1
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+ description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: PrometheusAlertmanagerConfigNotSynced
+ rules:
+ - alert: PrometheusAlertmanagerConfigNotSynced
+ expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
+ for: 3m
+ labels:
+ severity: warning
+ annotations:
+ summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+ description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: PrometheusTargetEmpty
+ rules:
+ - alert: PrometheusTargetEmpty
+ expr: prometheus_sd_discovered_targets == 0
+ for: 3m
+ labels:
+ severity: critical
+ annotations:
+ summary: Prometheus target empty (instance {{ $labels.instance }})
+ description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: PrometheusTargetScrapingSlow
+ rules:
+ - alert: PrometheusTargetScrapingSlow
+ expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+          description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: PrometheusLargeScrape
+ rules:
+ - alert: PrometheusLargeScrape
+ expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Prometheus large scrape (instance {{ $labels.instance }})
+ description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: HostMemoryUnderMemoryPressure
+ rules:
+ - alert: HostMemoryUnderMemoryPressure
+ expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host memory under memory pressure {{ $labels.node }}
+ description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: HostUnusualDiskReadRate
+ rules:
+ - alert: HostUnusualDiskReadRate
+ expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host unusual disk read rate {{ $labels.node }}
+ description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: HostUnusualDiskWriteRate
+ rules:
+ - alert: HostUnusualDiskWriteRate
+ expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
+ for: 3m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host unusual disk write rate {{ $labels.node }}
+ description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: HostCpuStealNoisyNeighbor
+ rules:
+ - alert: HostCpuStealNoisyNeighbor
+ expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host CPU steal noisy neighbor {{ $labels.node }}
+ description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: HostPhysicalComponentTooHot
+ rules:
+ - alert: HostPhysicalComponentTooHot
+ expr: node_hwmon_temp_celsius > 85
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host physical component too hot {{ $labels.node }}
+ description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
+ - name: SMARTbad
+ rules:
+ - alert: SMARTbad
+ expr: smartmon_device_smart_healthy < 1
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+          summary: SMART check failed for drive {{ $labels.exported_disk }} in server {{ $labels.node }}
+          description: "SMART check returned bad health for drive {{ $labels.exported_disk }} in server {{ $labels.node }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - name: SMARTcheck_old
+ rules:
+ - alert: "SMARTcheck too old"
+ expr: (time() - smartmon_smartctl_run) > 10800
+ labels:
+ severity: warning
+ annotations:
+ summary: "SMARTcheck not running"
+ description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
\ No newline at end of file
diff --git a/prometheus/config-map.yaml b/prometheus/config-map.yaml
deleted file mode 100644
index 0ee0504..0000000
--- a/prometheus/config-map.yaml
+++ /dev/null
@@ -1,522 +0,0 @@
----
-kind: ConfigMap
-metadata:
- labels:
- app: prometheus
- app.kubernetes.io/instance: prometheus
- component: server
- release: prometheus
- name: prometheus-server
- namespace: prometheus
-apiVersion: v1
-data:
- alerting_rules.yml: |
- {}
- alerts: |
- groups:
- - name: memory_high
- rules:
- - alert: MemoryHigh
- expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: "Memory over 80%"
- description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
- - name: diskspace_low_worker
- rules:
- - alert: DiskspaceLow
- expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: "Free disk space below 10 GB"
- description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- - name: diskspace_low_master
- rules:
- - alert: DiskspaceLow
- expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: "Free disk space below 2 GB"
- description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- - name: KubernetesUnhealthyPod
- rules:
- - alert: KubernetesUnhealthyPod
- expr: kube_pod_container_status_waiting_reason == 1
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
- description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
- - name: PrometheusTargetMissing
- rules:
- - alert: PrometheusTargetMissing
- expr: up == 0
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus target missing (instance {{ $labels.instance }})
- description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
- - name: PrometheusConfigurationReloadFailure
- rules:
- - alert: PrometheusConfigurationReloadFailure
- expr: prometheus_config_last_reload_successful != 1
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
- description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusAlertmanagerConfigurationReloadFailure
- rules:
- - alert: PrometheusAlertmanagerConfigurationReloadFailure
- expr: alertmanager_config_last_reload_successful != 1
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
- description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusAlertmanagerConfigNotSynced
- rules:
- - alert: PrometheusAlertmanagerConfigNotSynced
- expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
- for: 3m
- labels:
- severity: warning
- annotations:
- summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
- description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusTargetEmpty
- rules:
- - alert: PrometheusTargetEmpty
- expr: prometheus_sd_discovered_targets == 0
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus target empty (instance {{ $labels.instance }})
- description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusTargetScrapingSlow
- rules:
- - alert: PrometheusTargetScrapingSlow
- expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Prometheus target scraping (instance {{ $labels.instance }})
- description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusLargeScrape
- rules:
- - alert: PrometheusLargeScrape
- expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Prometheus large scrape (instance {{ $labels.instance }})
- description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostMemoryUnderMemoryPressure
- rules:
- - alert: HostMemoryUnderMemoryPressure
- expr: rate(node_vmstat_pgmajfault[1m]) > 1000
- for: 2m
- labels:
- severity: warning
- annotations:
- summary: Host memory under memory pressure {{ $labels.node }}
- description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostUnusualDiskReadRate
- rules:
- - alert: HostUnusualDiskReadRate
- expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Host unusual disk read rate {{ $labels.node }}
- description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostUnusualDiskWriteRate
- rules:
- - alert: HostUnusualDiskWriteRate
- expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
- for: 3m
- labels:
- severity: warning
- annotations:
- summary: Host unusual disk write rate {{ $labels.node }}
- description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostCpuStealNoisyNeighbor
- rules:
- - alert: HostCpuStealNoisyNeighbor
- expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
- for: 1m
- labels:
- severity: warning
- annotations:
- summary: Host CPU steal noisy neighbor {{ $labels.node }}
- description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostPhysicalComponentTooHot
- rules:
- - alert: HostPhysicalComponentTooHot
- expr: node_hwmon_temp_celsius > 85
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Host physical component too hot {{ $labels.node }}
- description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
- - name: SMARTbad
- rules:
- - alert: SMARTbad
- expr: smartmon_device_smart_healthy < 1
- for: 0m
- labels:
- severity: critical
- annotations:
- summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
- description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: SMARTcheck_old
- rules:
- - alert: "SMARTcheck too old"
- expr: (time() - smartmon_smartctl_run) > 10800
- labels:
- severity: warning
- annotations:
- summary: "SMARTcheck not running"
- description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
- allow-snippet-annotations: 'false'
- prometheus.yml: |
- global:
- evaluation_interval: 1m
- scrape_interval: 1m
- scrape_timeout: 10s
- rule_files:
- - /etc/config/recording_rules.yml
- - /etc/config/alerting_rules.yml
- - /etc/config/rules
- - /etc/config/alerts
- scrape_configs:
- - job_name: prometheus
- static_configs:
- - targets:
- - localhost:9090
- - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-apiservers
- kubernetes_sd_configs:
- - role: endpoints
- relabel_configs:
- - action: keep
- regex: default;kubernetes;https
- source_labels:
- - __meta_kubernetes_namespace
- - __meta_kubernetes_service_name
- - __meta_kubernetes_endpoint_port_name
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- insecure_skip_verify: true
- - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes
- kubernetes_sd_configs:
- - role: node
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
- - replacement: kubernetes.default.svc:443
- target_label: __address__
- - regex: (.+)
- replacement: /api/v1/nodes/$1/proxy/metrics
- source_labels:
- - __meta_kubernetes_node_name
- target_label: __metrics_path__
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- insecure_skip_verify: true
- - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- job_name: kubernetes-nodes-cadvisor
- kubernetes_sd_configs:
- - role: node
- relabel_configs:
- - action: labelmap
- regex: __meta_kubernetes_node_label_(.+)
- - replacement: kubernetes.default.svc:443
- target_label: __address__
- - regex: (.+)
- replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
- source_labels:
- - __meta_kubernetes_node_name
- target_label: __metrics_path__
- scheme: https
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- insecure_skip_verify: true
- - honor_labels: true
- job_name: kubernetes-service-endpoints
- kubernetes_sd_configs:
- - role: endpoints
- relabel_configs:
- - action: keep
- regex: true
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scrape
- - action: drop
- regex: true
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- - action: replace
- regex: (https?)
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scheme
- target_label: __scheme__
- - action: replace
- regex: (.+)
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_path
- target_label: __metrics_path__
- - action: replace
- regex: (.+?)(?::\d+)?;(\d+)
- replacement: $1:$2
- source_labels:
- - __address__
- - __meta_kubernetes_service_annotation_prometheus_io_port
- target_label: __address__
- - action: labelmap
- regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
- replacement: __param_$1
- - action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - action: replace
- source_labels:
- - __meta_kubernetes_namespace
- target_label: namespace
- - action: replace
- source_labels:
- - __meta_kubernetes_service_name
- target_label: service
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_node_name
- target_label: node
- - honor_labels: true
- job_name: kubernetes-service-endpoints-slow
- kubernetes_sd_configs:
- - role: endpoints
- relabel_configs:
- - action: keep
- regex: true
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow
- - action: replace
- regex: (https?)
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_scheme
- target_label: __scheme__
- - action: replace
- regex: (.+)
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_path
- target_label: __metrics_path__
- - action: replace
- regex: (.+?)(?::\d+)?;(\d+)
- replacement: $1:$2
- source_labels:
- - __address__
- - __meta_kubernetes_service_annotation_prometheus_io_port
- target_label: __address__
- - action: labelmap
- regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+)
- replacement: __param_$1
- - action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - action: replace
- source_labels:
- - __meta_kubernetes_namespace
- target_label: namespace
- - action: replace
- source_labels:
- - __meta_kubernetes_service_name
- target_label: service
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_node_name
- target_label: node
- scrape_interval: 5m
- scrape_timeout: 30s
- - honor_labels: true
- job_name: prometheus-pushgateway
- kubernetes_sd_configs:
- - role: service
- relabel_configs:
- - action: keep
- regex: pushgateway
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_probe
- - honor_labels: true
- job_name: kubernetes-services
- kubernetes_sd_configs:
- - role: service
- metrics_path: /probe
- params:
- module:
- - http_2xx
- relabel_configs:
- - action: keep
- regex: true
- source_labels:
- - __meta_kubernetes_service_annotation_prometheus_io_probe
- - source_labels:
- - __address__
- target_label: __param_target
- - replacement: blackbox
- target_label: __address__
- - source_labels:
- - __param_target
- target_label: instance
- - action: labelmap
- regex: __meta_kubernetes_service_label_(.+)
- - source_labels:
- - __meta_kubernetes_namespace
- target_label: namespace
- - source_labels:
- - __meta_kubernetes_service_name
- target_label: service
- - honor_labels: true
- job_name: kubernetes-pods
- kubernetes_sd_configs:
- - role: pod
- relabel_configs:
- - action: keep
- regex: true
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scrape
- - action: drop
- regex: true
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- - action: replace
- regex: (https?)
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scheme
- target_label: __scheme__
- - action: replace
- regex: (.+)
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_path
- target_label: __metrics_path__
- - action: replace
- regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
- replacement: '[$2]:$1'
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: replace
- regex: (\d+);((([0-9]+?)(\.|$)){4})
- replacement: $2:$1
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: labelmap
- regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
- replacement: __param_$1
- - action: labelmap
- regex: __meta_kubernetes_pod_label_(.+)
- - action: replace
- source_labels:
- - __meta_kubernetes_namespace
- target_label: namespace
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_name
- target_label: pod
- - action: drop
- regex: Pending|Succeeded|Failed|Completed
- source_labels:
- - __meta_kubernetes_pod_phase
- - honor_labels: true
- job_name: kubernetes-pods-slow
- kubernetes_sd_configs:
- - role: pod
- relabel_configs:
- - action: keep
- regex: true
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow
- - action: replace
- regex: (https?)
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_scheme
- target_label: __scheme__
- - action: replace
- regex: (.+)
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_path
- target_label: __metrics_path__
- - action: replace
- regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4})
- replacement: '[$2]:$1'
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: replace
- regex: (\d+);((([0-9]+?)(\.|$)){4})
- replacement: $2:$1
- source_labels:
- - __meta_kubernetes_pod_annotation_prometheus_io_port
- - __meta_kubernetes_pod_ip
- target_label: __address__
- - action: labelmap
- regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+)
- replacement: __param_$1
- - action: labelmap
- regex: __meta_kubernetes_pod_label_(.+)
- - action: replace
- source_labels:
- - __meta_kubernetes_namespace
- target_label: namespace
- - action: replace
- source_labels:
- - __meta_kubernetes_pod_name
- target_label: pod
- - action: drop
- regex: Pending|Succeeded|Failed|Completed
- source_labels:
- - __meta_kubernetes_pod_phase
- scrape_interval: 5m
- scrape_timeout: 30s
- alerting:
- alertmanagers:
- - kubernetes_sd_configs:
- - role: pod
- tls_config:
- ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
- bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
- relabel_configs:
- - source_labels: [__meta_kubernetes_namespace]
- regex: prometheus
- action: keep
- - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_instance]
- regex: prometheus
- action: keep
- - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
- regex: alertmanager
- action: keep
- - source_labels: [__meta_kubernetes_pod_container_port_number]
- regex: "9093"
- action: keep
- recording_rules.yml: |
- {}
- rules: |
- {}
diff --git a/prometheus/ingress.yaml b/prometheus/ingress.yaml
index 10f41e4..d09cdff 100644
--- a/prometheus/ingress.yaml
+++ b/prometheus/ingress.yaml
@@ -1,46 +1,4 @@
---
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
- name: prometheus-ingress
- namespace: prometheus
- annotations:
- nginx.org/basic-auth-secret: prometheus-basic-auth-secret
-spec:
- ingressClassName: nginx
- rules:
- - host: "prometheus.services.yolokube.de"
- http:
- paths:
- - pathType: Prefix
- path: "/"
- backend:
- service:
- name: prometheus-server
- port:
- number: 80
----
-apiVersion: networking.k8s.io/v1
-kind: Ingress
-metadata:
- name: alertmanager-ingress
- namespace: prometheus
- annotations:
- nginx.org/basic-auth-secret: prometheus-basic-auth-secret
-spec:
- ingressClassName: nginx
- rules:
- - host: "alertmanager.services.yolokube.de"
- http:
- paths:
- - pathType: Prefix
- path: "/"
- backend:
- service:
- name: prometheus-alertmanager
- port:
- number: 9093
----
kind: Secret
metadata:
name: prometheus-basic-auth-secret
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 74821dc..568e549 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1,17 +1,101 @@
alertmanager:
- image:
- repository: aaronriedel/alertmanager
- tag: "latest"
- extraSecretMounts:
- - name: telegram-api
- mountPath: /etc/alertmanager/telegram-token
- subPath: ""
- secretName: telegram-api
- readOnly: true
- configmapReload:
+ alertmanagerSpec:
+ image:
+ registry: docker.io
+ repository: aaronriedel/alertmanager
+ tag: "latest"
+ replicas: 2
+    externalUrl: https://alertmanager.services.yolokube.de
+ config:
+ global:
+ resolve_timeout: 5m
+ templates:
+ - '/etc/alertmanager/config/*.tmpl'
+ route:
+ group_by: ['alertname']
+ group_wait: 30s
+ group_interval: 30s
+ repeat_interval: 24h
+ receiver: 'tg1'
+ routes:
+ - matchers:
+ - severity=warning
+ receiver: 'tg1'
+ - matchers:
+ - severity=critical
+ receiver: 'tg1'
+ receivers:
+ - name: tg1
+ telegram_configs:
+ - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
+ chat_id: -995270884
+ api_url: "https://api.telegram.org"
+ send_resolved: true
+ parse_mode: "HTML"
+ message: '{{ template "telegram.aaron" .}}'
+ inhibit_rules:
+ - source_matchers:
+ - severity = critical
+ target_matchers:
+ - severity = warning
+ equal: ['alertname', 'server', 'instance']
+ templateFiles:
+ telegram.tmpl: |-
+ {{ define "telegram.aaron" }}
+ {{ range .Alerts }}
+ {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }}
+ {{ .Annotations.summary }}
+
+ {{ .Annotations.description }}
+ {{ end }}
+ {{ end }}
+ ingress:
enabled: true
+ ingressClassName: nginx
+ hosts:
+ - alertmanager.services.yolokube.de
+ annotations:
+ nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+ ingressPerReplica:
+ enabled: true
+ ingressClassName: nginx
+ hostPrefix: alertmanager
+ hostDomain: services.yolokube.de
+ annotations:
+ nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+ servicePerReplica:
+ enabled: true
+grafana:
+ defaultDashboardsTimezone: Europe/Berlin
+ ingress:
+ enabled: true
+ hosts:
+ - grafana.services.yolokube.de
+ ingressClassName: nginx
prometheus-node-exporter:
extraArgs:
- - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+ - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)'
+ - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$'
- '--collector.textfile.directory=/host/root/var/log/'
- '--collector.ethtool'
+prometheus:
+ servicePerReplica:
+ enabled: true
+ ingress:
+ enabled: true
+ ingressClassName: nginx
+ hosts:
+ - prometheus.services.yolokube.de
+ annotations:
+ nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+ ingressPerReplica:
+ enabled: true
+ hostPrefix: prometheus
+ hostDomain: services.yolokube.de
+ annotations:
+ nginx.org/basic-auth-secret: prometheus-basic-auth-secret
+ prometheusSpec:
+    externalUrl: https://prometheus.services.yolokube.de
+ replicas: 2
+ servicePerReplica:
+ enabled: true
\ No newline at end of file