try to fix prometheus deployment 6 (final) (for now)
This commit is contained in:
parent
c706f9b61e
commit
e63707d16c
3 changed files with 38 additions and 149 deletions
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
|||
**/secret.yaml
|
|
@ -4,9 +4,11 @@ kind: PrometheusRule
|
|||
metadata:
|
||||
name: prometheus-core-deployment-rules
|
||||
namespace: prometheus
|
||||
labels:
|
||||
monitor: core-deployment
|
||||
spec:
|
||||
groups:
|
||||
- name: memory_high
|
||||
- name: hardware
|
||||
rules:
|
||||
- alert: MemoryHigh
|
||||
expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
|
||||
|
@ -16,9 +18,7 @@ spec:
|
|||
annotations:
|
||||
summary: "Memory over 80%"
|
||||
description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
|
||||
- name: diskspace_low_worker
|
||||
rules:
|
||||
- alert: DiskspaceLow
|
||||
- alert: DiskspaceLowWorker
|
||||
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"worker.*"} / 1073742000, 0.1) < 50
|
||||
for: 1m
|
||||
labels:
|
||||
|
@ -26,9 +26,7 @@ spec:
|
|||
annotations:
|
||||
summary: "Free disk space below 10 GB"
|
||||
description: "Disk space on server {{ $labels.node }} is under 10 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
||||
- name: diskspace_low_master
|
||||
rules:
|
||||
- alert: DiskspaceLow
|
||||
- alert: DiskspaceLowMaster
|
||||
expr: round(node_filesystem_avail_bytes{mountpoint="/", node=~"master.*"} / 1073742000, 0.1) < 10
|
||||
for: 1m
|
||||
labels:
|
||||
|
@ -36,7 +34,62 @@ spec:
|
|||
annotations:
|
||||
summary: "Free disk space below 2 GB"
|
||||
description: "Disk space on server {{ $labels.node }} is under 2 GB. Plox fix. Free Space: {{ $value }} GB on partition {{ $labels.device }}"
|
||||
- name: KubernetesUnhealthyPod
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure {{ $labels.node }}
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate {{ $labels.node }}
|
||||
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate {{ $labels.node }}
|
||||
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor {{ $labels.node }}
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot {{ $labels.node }}
|
||||
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
|
||||
- alert: SMARTbad
|
||||
expr: smartmon_device_smart_healthy < 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
|
||||
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: "SMARTcheck too old"
|
||||
expr: (time() - smartmon_smartctl_run) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SMARTcheck not running"
|
||||
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
- name: kubernetes
|
||||
rules:
|
||||
- alert: KubernetesUnhealthyPod
|
||||
expr: kube_pod_container_status_waiting_reason == 1
|
||||
|
@ -46,142 +99,3 @@ spec:
|
|||
annotations:
|
||||
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
||||
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
||||
- name: PrometheusTargetMissing
|
||||
rules:
|
||||
- alert: PrometheusTargetMissing
|
||||
expr: up == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus target missing (instance {{ $labels.instance }})
|
||||
description: "A Prometheus target has disappeared. {{if ne $labels.job \"\"}}\n Job: {{ $labels.job }}{{end}}{{if ne $labels.app \"\"}}\n App: {{ $labels.app }}{{end}}{{if ne $labels.pod \"\"}}\n Pod: {{ $labels.pod }}{{end}}{{if ne $labels.node \"\"}}\n Node: {{ $labels.node }}{{end}}{{if ne $labels.namespace \"\"}}\n Namespace: {{ $labels.namespace }}{{end}}"
|
||||
- name: PrometheusConfigurationReloadFailure
|
||||
rules:
|
||||
- alert: PrometheusConfigurationReloadFailure
|
||||
expr: prometheus_config_last_reload_successful != 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
|
||||
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PrometheusAlertmanagerConfigurationReloadFailure
|
||||
rules:
|
||||
- alert: PrometheusAlertmanagerConfigurationReloadFailure
|
||||
expr: alertmanager_config_last_reload_successful != 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
|
||||
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PrometheusAlertmanagerConfigNotSynced
|
||||
rules:
|
||||
- alert: PrometheusAlertmanagerConfigNotSynced
|
||||
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
|
||||
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PrometheusTargetEmpty
|
||||
rules:
|
||||
- alert: PrometheusTargetEmpty
|
||||
expr: prometheus_sd_discovered_targets == 0
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Prometheus target empty (instance {{ $labels.instance }})
|
||||
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PrometheusTargetScrapingSlow
|
||||
rules:
|
||||
- alert: PrometheusTargetScrapingSlow
|
||||
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 120
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
|
||||
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PrometheusLargeScrape
|
||||
rules:
|
||||
- alert: PrometheusLargeScrape
|
||||
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Prometheus large scrape (instance {{ $labels.instance }})
|
||||
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: HostMemoryUnderMemoryPressure
|
||||
rules:
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure {{ $labels.node }}
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: HostUnusualDiskReadRate
|
||||
rules:
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate {{ $labels.node }}
|
||||
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: HostUnusualDiskWriteRate
|
||||
rules:
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate {{ $labels.node }}
|
||||
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: HostCpuStealNoisyNeighbor
|
||||
rules:
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor {{ $labels.node }}
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: HostPhysicalComponentTooHot
|
||||
rules:
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot {{ $labels.node }}
|
||||
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
|
||||
- name: SMARTbad
|
||||
rules:
|
||||
- alert: SMARTbad
|
||||
expr: smartmon_device_smart_healthy < 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART check bad of drive {{ $labels.exported_disk }} in server {{ $labels.node }}
|
||||
description: "SMART check returned bad health of {{ $labels.exported_disk }} in server {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: SMARTcheck_old
|
||||
rules:
|
||||
- alert: "SMARTcheck too old"
|
||||
expr: (time() - smartmon_smartctl_run) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SMARTcheck not running"
|
||||
description: 'The last SMARTcheck on server {{ $labels.node }} was more than 3h ago. Plox fix.'
|
|
@ -1,53 +1,14 @@
|
|||
alertmanager:
|
||||
alertmanagerSpec:
|
||||
image:
|
||||
registry: registry-1.docker.io
|
||||
repository: aaronriedel/alertmanager
|
||||
tag: latest
|
||||
replicas: 2
|
||||
config:
|
||||
global:
|
||||
resolve_timeout: 5m
|
||||
templates:
|
||||
- '/etc/alertmanager/config/*.tmpl'
|
||||
route:
|
||||
group_by: ['alertname']
|
||||
group_wait: 30s
|
||||
group_interval: 30s
|
||||
repeat_interval: 24h
|
||||
receiver: 'tg1'
|
||||
routes:
|
||||
- matchers:
|
||||
- severity=warning
|
||||
receiver: 'tg1'
|
||||
- matchers:
|
||||
- severity=critical
|
||||
receiver: 'tg1'
|
||||
receivers:
|
||||
- name: tg1
|
||||
telegram_configs:
|
||||
- bot_token_file: '/etc/alertmanager/telegram-token/api_key'
|
||||
chat_id: -995270884
|
||||
api_url: "https://api.telegram.org"
|
||||
send_resolved: true
|
||||
parse_mode: "HTML"
|
||||
message: '{{ template "telegram.aaron" .}}'
|
||||
inhibit_rules:
|
||||
- source_matchers:
|
||||
- severity = critical
|
||||
target_matchers:
|
||||
- severity = warning
|
||||
equal: ['alertname', 'server', 'instance']
|
||||
templateFiles:
|
||||
telegram.tmpl: |-
|
||||
{{ define "telegram.aaron" }}
|
||||
{{ range .Alerts }}
|
||||
{{ if eq .Status "firing"}}🔥 <b>{{ .Labels.alertname }}</b> 🔥{{ else }}✅ <b>{{ .Labels.alertname }}</b> ✅{{ end }}
|
||||
<b> {{ .Annotations.summary }} </b>
|
||||
|
||||
{{ .Annotations.description }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
useExistingSecret: true
|
||||
ingress:
|
||||
paths:
|
||||
- /
|
||||
|
@ -105,5 +66,18 @@ prometheus:
|
|||
nginx.org/basic-auth-secret: prometheus-basic-auth-secret
|
||||
prometheusSpec:
|
||||
replicas: 2
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
ruleNamespaceSelector: {}
|
||||
ruleSelectorNilUsesHelmValues: false
|
||||
serviceMonitorSelectorNilUsesHelmValues: false
|
||||
podMonitorSelectorNilUsesHelmValues: false
|
||||
probeSelectorNilUsesHelmValues: false
|
||||
scrapeConfigSelectorNilUsesHelmValues: false
|
||||
servicePerReplica:
|
||||
enabled: true
|
Loading…
Reference in a new issue