From e63707d16cd148746bed3b7a429591ae61816413 Mon Sep 17 00:00:00 2001
From: Aaron Riedel
Date: Tue, 20 Jun 2023 13:15:28 +0200
Subject: [PATCH] try to fix prometheus deployment 6 (final) (for now)

---
 .gitignore             |   1 +
 prometheus/alerts.yaml | 118 ++++++-----------------------
 prometheus/values.yaml |  68 ++++++++----------
 3 files changed, 38 insertions(+), 149 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d37e42c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+**/secret.yaml
\ No newline at end of file
diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
index 248223f..b1e269b 100644
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -4,9 +4,11 @@ kind: PrometheusRule
 metadata:
   name: prometheus-core-deployment-rules
   namespace: prometheus
+  labels:
+    monitor: core-deployment
 spec:
   groups:
-  - name: memory_high
+  - name: hardware
     rules:
     - alert: MemoryHigh
       expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
@@ -16,9 +18,7 @@
       annotations:
         summary: "Memory over 80%"
         description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
-  - name: diskspace_low_worker
-    rules:
-    - alert: DiskspaceLow
+    - alert: DiskspaceLowWorker
       expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
       for: 1m
       labels:
@@ -26,9 +26,7 @@
       annotations:
         summary: "Free disk space below 50 GB"
         description: "Disk space on server {{ $labels.node }} is under 50 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
-  - name: diskspace_low_master
-    rules:
-    - alert: DiskspaceLow
+    - alert: DiskspaceLowMaster
       expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
       for: 1m
       labels:
@@ -36,88 +34,6 @@
       annotations:
         summary: "Free disk space below 10 GB"
         description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
-  - name: KubernetesUnhealthyPod
-    rules:
-    - alert: KubernetesUnhealthyPod
-      expr: kube_pod_container_status_waiting_reason == 1
-      for: 5m
-      labels:
-        severity: warning
-      annotations:
-        summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
-        description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
-  - name: PrometheusTargetMissing
-    rules:
-    - alert: PrometheusTargetMissing
-      expr: up == 0
-      for: 3m
-      labels:
-        severity: critical
-      annotations:
-        summary: Prometheus target missing (instance {{ $labels.instance }})
{{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}" - - name: PrometheusConfigurationReloadFailure - rules: - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusAlertmanagerConfigurationReloadFailure - rules: - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusAlertmanagerConfigNotSynced - rules: - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 3m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusTargetEmpty - rules: - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 3m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusTargetScrapingSlow - rules: - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: PrometheusLargeScrape - rules: - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - name: HostMemoryUnderMemoryPressure - rules: - alert: HostMemoryUnderMemoryPressure expr: rate(node_vmstat_pgmajfault[1m]) > 1000 for: 2m @@ -126,8 +42,6 @@ spec: annotations: summary: Host memory under memory pressure {{ $labels.node }} description: "The node is under heavy memory pressure. 
         description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - name: HostUnusualDiskReadRate
-    rules:
     - alert: HostUnusualDiskReadRate
       expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
       for: 5m
@@ -136,8 +50,6 @@
       annotations:
         summary: Host unusual disk read rate {{ $labels.node }}
         description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - name: HostUnusualDiskWriteRate
-    rules:
     - alert: HostUnusualDiskWriteRate
       expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
       for: 3m
@@ -146,8 +58,6 @@
       annotations:
         summary: Host unusual disk write rate {{ $labels.node }}
         description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - name: HostCpuStealNoisyNeighbor
-    rules:
     - alert: HostCpuStealNoisyNeighbor
       expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
       for: 1m
@@ -156,8 +66,6 @@
       annotations:
         summary: Host CPU steal noisy neighbor {{ $labels.node }}
         description: "CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - name: HostPhysicalComponentTooHot
-    rules:
     - alert: HostPhysicalComponentTooHot
       expr: node_hwmon_temp_celsius > 85
       for: 5m
@@ -166,8 +74,6 @@
       annotations:
         summary: Host physical component too hot {{ $labels.node }}
         description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
-  - name: SMARTbad
-    rules:
     - alert: SMARTbad
       expr: smartmon_device_smart_healthy < 1
       for: 0m
@@ -176,12 +82,20 @@
       annotations:
         summary: SMART check failed for drive {{ $labels.exported_disk }} in server {{ $labels.node }}
         description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
-  - name: SMARTcheck_old
-    rules:
     - alert: "SMARTcheck too old"
       expr: (time() - smartmon_smartctl_run) > 10800
       labels:
         severity: warning
       annotations:
         summary: "SMARTcheck not running"
-        description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
\ No newline at end of file
+        description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
+  - name: kubernetes
+    rules:
+    - alert: KubernetesUnhealthyPod
+      expr: kube_pod_container_status_waiting_reason == 1
+      for: 5m
+      labels:
+        severity: warning
+      annotations:
+        summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
+        description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
\ No newline at end of file diff --git a/prometheus/values.yaml b/prometheus/values.yaml index 83a34fd..abff96a 100644 --- a/prometheus/values.yaml +++ b/prometheus/values.yaml @@ -1,53 +1,14 @@ alertmanager: alertmanagerSpec: - image: - registry: registry-1.docker.io - repository: aaronriedel/alertmanager - tag: latest replicas: 2 - config: - global: - resolve_timeout: 5m - templates: - - '/etc/alertmanager/config/*.tmpl' - route: - group_by: ['alertname'] - group_wait: 30s - group_interval: 30s - repeat_interval: 24h - receiver: 'tg1' - routes: - - matchers: - - severity=warning - receiver: 'tg1' - - matchers: - - severity=critical - receiver: 'tg1' - receivers: - - name: tg1 - telegram_configs: - - bot_token_file: '/etc/alertmanager/telegram-token/api_key' - chat_id: -995270884 - api_url: "https://api.telegram.org" - send_resolved: true - parse_mode: "HTML" - message: '{{ template "telegram.aaron" .}}' - inhibit_rules: - - source_matchers: - - severity = critical - target_matchers: - - severity = warning - equal: ['alertname', 'server', 'instance'] - templateFiles: - telegram.tmpl: |- - {{ define "telegram.aaron" }} - {{ range .Alerts }} - {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }} - {{ .Annotations.summary }} - - {{ .Annotations.description }} - {{ end }} - {{ end }} + storage: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 5Gi + useExistingSecret: true ingress: paths: - / @@ -105,5 +66,18 @@ prometheus: nginx.org/basic-auth-secret: prometheus-basic-auth-secret prometheusSpec: replicas: 2 + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + ruleNamespaceSelector: {} + ruleSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + podMonitorSelectorNilUsesHelmValues: false + probeSelectorNilUsesHelmValues: false + scrapeConfigSelectorNilUsesHelmValues: false servicePerReplica: enabled: true \ No newline at end of file
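
Note on useExistingSecret: with the inline Alertmanager config removed from values.yaml, the chart no longer renders the config secret itself, and the new **/secret.yaml gitignore entry suggests the configuration now lives in an untracked file. Below is a minimal sketch of what that secret could look like. The secret name is an assumption (the operator mounts the secret named "alertmanager-<name of the Alertmanager object>", which depends on the Helm release), the data key must be alertmanager.yaml, and the receiver simply mirrors the Telegram settings deleted above; this is not the deployed file.

apiVersion: v1
kind: Secret
metadata:
  # Assumed name; replace with "alertmanager-<Alertmanager object name>"
  # as actually created by the release.
  name: alertmanager-prometheus-kube-prometheus-alertmanager
  namespace: prometheus
stringData:
  # The operator expects the configuration under this exact key.
  alertmanager.yaml: |
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      receiver: 'tg1'
    receivers:
    - name: tg1
      telegram_configs:
      - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
        chat_id: -995270884
        send_resolved: true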
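Note on the selector settings: setting the *SelectorNilUsesHelmValues flags to false (together with ruleNamespaceSelector: {} and the chart's default namespace selectors) makes the operator pick up PrometheusRule, ServiceMonitor, PodMonitor, Probe and ScrapeConfig objects cluster-wide, instead of only those carrying the Helm release label; the monitor: core-deployment label added to the rule above is then purely organizational. As a rough illustration (every name here is assumed, not part of this repo), a ServiceMonitor like the following would now be scraped without any release label:

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: example-app        # assumed name
  namespace: default       # any namespace is matched with these settings
spec:
  selector:
    matchLabels:
      app: example-app     # assumed label on the target Service
  endpoints:
  - port: metrics          # assumed port name on the Service
    interval: 30s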