diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d37e42c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+**/secret.yaml
\ No newline at end of file
diff --git a/prometheus/alerts.yaml b/prometheus/alerts.yaml
index 248223f..b1e269b 100644
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -4,9 +4,11 @@ kind: PrometheusRule
metadata:
name: prometheus-core-deployment-rules
namespace: prometheus
+ labels:
+ monitor: core-deployment
spec:
groups:
- - name: memory_high
+ - name: hardware
rules:
- alert: MemoryHigh
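+ # round(<expr>, 0.1) rounds to the nearest 0.1, so {{ $value }} renders with one decimal place in the notification text.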
expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
@@ -16,9 +18,7 @@ spec:
annotations:
summary: "Memory over 80%"
description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
- - name: diskspace_low_worker
- rules:
- - alert: DiskspaceLow
+ - alert: DiskspaceLowWorker
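+ # Dividing node_filesystem_avail_bytes by 1073742000 (~2^30) converts bytes to GB, so this fires when a worker's root filesystem has less than ~50 GB free.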
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
for: 1m
labels:
@@ -26,9 +26,7 @@ spec:
annotations:
summary: "Free disk space below 10 GB"
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- - name: diskspace_low_master
- rules:
- - alert: DiskspaceLow
+ - alert: DiskspaceLowMaster
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
for: 1m
labels:
@@ -36,88 +34,6 @@ spec:
annotations:
summary: "Free disk space below 2 GB"
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
- - name: KubernetesUnhealthyPod
- rules:
- - alert: KubernetesUnhealthyPod
- expr: kube_pod_container_status_waiting_reason == 1
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
- description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
- - name: PrometheusTargetMissing
- rules:
- - alert: PrometheusTargetMissing
- expr: up == 0
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus target missing (instance {{ $labels.instance }})
- description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
- - name: PrometheusConfigurationReloadFailure
- rules:
- - alert: PrometheusConfigurationReloadFailure
- expr: prometheus_config_last_reload_successful != 1
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
- description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusAlertmanagerConfigurationReloadFailure
- rules:
- - alert: PrometheusAlertmanagerConfigurationReloadFailure
- expr: alertmanager_config_last_reload_successful != 1
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
- description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusAlertmanagerConfigNotSynced
- rules:
- - alert: PrometheusAlertmanagerConfigNotSynced
- expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
- for: 3m
- labels:
- severity: warning
- annotations:
- summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
- description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusTargetEmpty
- rules:
- - alert: PrometheusTargetEmpty
- expr: prometheus_sd_discovered_targets == 0
- for: 3m
- labels:
- severity: critical
- annotations:
- summary: Prometheus target empty (instance {{ $labels.instance }})
- description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusTargetScrapingSlow
- rules:
- - alert: PrometheusTargetScrapingSlow
- expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Prometheus target scraping (instance {{ $labels.instance }})
- description: "Prometheus is scraping exporters ly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: PrometheusLargeScrape
- rules:
- - alert: PrometheusLargeScrape
- expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
- for: 5m
- labels:
- severity: warning
- annotations:
- summary: Prometheus large scrape (instance {{ $labels.instance }})
- description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostMemoryUnderMemoryPressure
- rules:
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
@@ -126,8 +42,6 @@ spec:
annotations:
summary: Host memory under memory pressure {{ $labels.node }}
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostUnusualDiskReadRate
- rules:
- alert: HostUnusualDiskReadRate
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
for: 5m
@@ -136,8 +50,6 @@ spec:
annotations:
summary: Host unusual disk read rate {{ $labels.node }}
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostUnusualDiskWriteRate
- rules:
- alert: HostUnusualDiskWriteRate
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
for: 3m
@@ -146,8 +58,6 @@ spec:
annotations:
summary: Host unusual disk write rate {{ $labels.node }}
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostCpuStealNoisyNeighbor
- rules:
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 1m
@@ -156,8 +66,6 @@ spec:
annotations:
summary: Host CPU steal noisy neighbor {{ $labels.node }}
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: HostPhysicalComponentTooHot
- rules:
- alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > 85
for: 5m
@@ -166,8 +74,6 @@ spec:
annotations:
summary: Host physical component too hot {{ $labels.node }}
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
- - name: SMARTbad
- rules:
- alert: SMARTbad
expr: smartmon_device_smart_healthy < 1
for: 0m
@@ -176,12 +82,20 @@ spec:
annotations:
summary: SMART health check failed for drive {{ $labels.exported_disk }} on server {{ $labels.node }}
description: "SMART check reports bad health for {{ $labels.exported_disk }} on server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- - name: SMARTcheck_old
- rules:
- alert: "SMARTcheck too old"
expr: (time() - smartmon_smartctl_run) > 10800
labels:
severity: warning
annotations:
summary: "SMARTcheck not running"
- description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
\ No newline at end of file
+ description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
+ - name: kubernetes
+ rules:
+ - alert: KubernetesUnhealthyPod
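+ # kube_pod_container_status_waiting_reason is 1 while a container sits in a waiting state such as CrashLoopBackOff or ImagePullBackOff.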
+ expr: kube_pod_container_status_waiting_reason == 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
+ description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
\ No newline at end of file
diff --git a/prometheus/values.yaml b/prometheus/values.yaml
index 83a34fd..abff96a 100644
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1,53 +1,14 @@
alertmanager:
alertmanagerSpec:
- image:
- registry: registry-1.docker.io
- repository: aaronriedel/alertmanager
- tag: latest
replicas: 2
- config:
- global:
- resolve_timeout: 5m
- templates:
- - '/etc/alertmanager/config/*.tmpl'
- route:
- group_by: ['alertname']
- group_wait: 30s
- group_interval: 30s
- repeat_interval: 24h
- receiver: 'tg1'
- routes:
- - matchers:
- - severity=warning
- receiver: 'tg1'
- - matchers:
- - severity=critical
- receiver: 'tg1'
- receivers:
- - name: tg1
- telegram_configs:
- - bot_token_file: '/etc/alertmanager/telegram-token/api_key'
- chat_id: -995270884
- api_url: "https://api.telegram.org"
- send_resolved: true
- parse_mode: "HTML"
- message: '{{ template "telegram.aaron" .}}'
- inhibit_rules:
- - source_matchers:
- - severity = critical
- target_matchers:
- - severity = warning
- equal: ['alertname', 'server', 'instance']
- templateFiles:
- telegram.tmpl: |-
- {{ define "telegram.aaron" }}
- {{ range .Alerts }}
- {{ if eq .Status "firing"}}🔥 {{ .Labels.alertname }} 🔥{{ else }}✅ {{ .Labels.alertname }} ✅{{ end }}
- {{ .Annotations.summary }}
-
- {{ .Annotations.description }}
- {{ end }}
- {{ end }}
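+ # Persist Alertmanager state (silences, notification log) in a small PVC so it survives pod restarts.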
+ storage:
+ volumeClaimTemplate:
+ spec:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 5Gi
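+ # The receivers, routes and Telegram template removed above now come from a pre-created secret; useExistingSecret stops the chart from generating its own config secret.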
+ useExistingSecret: true
ingress:
paths:
- /
@@ -105,5 +66,18 @@ prometheus:
nginx.org/basic-auth-secret: prometheus-basic-auth-secret
prometheusSpec:
replicas: 2
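+ # Persist the Prometheus TSDB in a PVC per replica instead of the default emptyDir.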
+ storageSpec:
+ volumeClaimTemplate:
+ spec:
+ accessModes: ["ReadWriteOnce"]
+ resources:
+ requests:
+ storage: 10Gi
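+ # ruleNamespaceSelector: {} watches every namespace, and the *NilUsesHelmValues flags set to false make Prometheus pick up PrometheusRules, ServiceMonitors, PodMonitors, Probes and ScrapeConfigs that were not created by this Helm release (e.g. the rules in alerts.yaml).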
+ ruleNamespaceSelector: {}
+ ruleSelectorNilUsesHelmValues: false
+ serviceMonitorSelectorNilUsesHelmValues: false
+ podMonitorSelectorNilUsesHelmValues: false
+ probeSelectorNilUsesHelmValues: false
+ scrapeConfigSelectorNilUsesHelmValues: false
servicePerReplica:
enabled: true
\ No newline at end of file