fix typos and file layout for yamllint
Some checks failed
ci/woodpecker/push/yamllint Pipeline failed
Some checks failed
ci/woodpecker/push/yamllint Pipeline failed
This commit is contained in:
parent
61d19f3413
commit
e00cc2d4dd
65 changed files with 684 additions and 631 deletions
|
@ -10,114 +10,114 @@ spec:
|
|||
groups:
|
||||
- name: hardware
|
||||
rules:
|
||||
- alert: MemoryHigh
|
||||
expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Memory over 80%"
|
||||
description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
|
||||
- alert: DiskspaceLow
|
||||
expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Free disk space at {{ $value }}%"
|
||||
description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure {{ $labels.node }}
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate {{ $labels.node }}
|
||||
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate {{ $labels.node }}
|
||||
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor {{ $labels.node }}
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot {{ $labels.node }}
|
||||
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
|
||||
- alert: SMARTbad
|
||||
expr: smartmon_device_smart_healthy < 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
|
||||
description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: "SMARTcheck too old"
|
||||
expr: (time() - smartmon_smartctl_run) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SMARTcheck not running"
|
||||
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
- alert: "ECC Memory errors"
|
||||
expr: (node_edac_correctable_errors_total) > 100
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "ECC errors on {{ $labels.node }}"
|
||||
description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
|
||||
- alert: "ECC Memory uncorrectable errors"
|
||||
expr: (node_edac_uncorrectable_errors_total) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "ECC errors on {{ $labels.node }}"
|
||||
description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
|
||||
- alert: MemoryHigh
|
||||
expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Memory over 80%"
|
||||
description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
|
||||
- alert: DiskspaceLow
|
||||
expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Free disk space at {{ $value }}%"
|
||||
description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure {{ $labels.node }}
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate {{ $labels.node }}
|
||||
description: "Disk is probably reading too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
|
||||
for: 3m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate {{ $labels.node }}
|
||||
description: "Disk is probably writing too much data (> 200 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor {{ $labels.node }}
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 90
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot {{ $labels.node }}
|
||||
description: "Physical hardware component too hot\n Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
|
||||
- alert: SMARTbad
|
||||
expr: smartmon_device_smart_healthy < 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
|
||||
description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: "SMARTcheck too old"
|
||||
expr: (time() - smartmon_smartctl_run) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "SMARTcheck not running"
|
||||
description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
- alert: "ECC Memory errors"
|
||||
expr: (node_edac_correctable_errors_total) > 100
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "ECC errors on {{ $labels.node }}"
|
||||
description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
|
||||
- alert: "ECC Memory uncorrectable errors"
|
||||
expr: (node_edac_uncorrectable_errors_total) > 0
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "ECC errors on {{ $labels.node }}"
|
||||
description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
|
||||
- name: etcdbackup
|
||||
rules:
|
||||
- alert: "etcdbackup too old"
|
||||
expr: (time() - etcdbackup_time) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "etcd backup not running"
|
||||
description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
- alert: "etcdbackup failed"
|
||||
expr: etcdbackup_result > 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "etcdbackup failed"
|
||||
description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
|
||||
- alert: "etcdbackup too old"
|
||||
expr: (time() - etcdbackup_time) > 10800
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "etcd backup not running"
|
||||
description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
|
||||
- alert: "etcdbackup failed"
|
||||
expr: etcdbackup_result > 0
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "etcdbackup failed"
|
||||
description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
|
||||
- name: kubernetes
|
||||
rules:
|
||||
- alert: KubernetesUnhealthyPod
|
||||
expr: kube_pod_container_status_waiting_reason == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
||||
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
||||
- alert: KubernetesUnhealthyPod
|
||||
expr: kube_pod_container_status_waiting_reason == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
|
||||
description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
generators:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: viaduct.ai/v1
|
||||
kind: ksops
|
||||
metadata:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
|
@ -11,6 +12,6 @@ spec:
|
|||
app: longhorn-manager
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- longhorn-system
|
||||
- longhorn-system
|
||||
endpoints:
|
||||
- port: manager
|
||||
- port: manager
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
|
@ -12,4 +13,4 @@ data:
|
|||
|
||||
{{ .Annotations.description }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
{{ end }}
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
---
|
||||
alertmanager:
|
||||
alertmanagerSpec:
|
||||
podAntiAffinity: "hard"
|
||||
|
@ -8,11 +9,11 @@ alertmanager:
|
|||
- "templates"
|
||||
storage:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 5Gi
|
||||
useExistingSecret: false
|
||||
config:
|
||||
global:
|
||||
|
@ -27,20 +28,20 @@ alertmanager:
|
|||
receiver: 'tg1'
|
||||
routes:
|
||||
- matchers:
|
||||
- severity=warning
|
||||
- severity=warning
|
||||
receiver: 'tg1'
|
||||
- matchers:
|
||||
- severity=critical
|
||||
- severity=critical
|
||||
receiver: 'tg1'
|
||||
receivers:
|
||||
- name: tg1
|
||||
telegram_configs:
|
||||
- bot_token_file: '/etc/alertmanager/secrets/telegram-api/api_key'
|
||||
chat_id: -995270884
|
||||
api_url: "https://api.telegram.org"
|
||||
send_resolved: true
|
||||
parse_mode: "HTML"
|
||||
message: '{{ template "telegram.aaron" .}}'
|
||||
- name: tg1
|
||||
telegram_configs:
|
||||
- bot_token_file: '/etc/alertmanager/secrets/telegram-api/api_key'
|
||||
chat_id: -995270884
|
||||
api_url: "https://api.telegram.org"
|
||||
send_resolved: true
|
||||
parse_mode: "HTML"
|
||||
message: '{{ template "telegram.aaron" .}}'
|
||||
inhibit_rules:
|
||||
- source_matchers:
|
||||
- severity = critical
|
||||
|
@ -97,7 +98,7 @@ grafana:
|
|||
persistence:
|
||||
enabled: true
|
||||
accessModes:
|
||||
- ReadWriteMany
|
||||
- ReadWriteMany
|
||||
grafana.ini:
|
||||
auth:
|
||||
disable_login_form: true
|
||||
|
@ -168,12 +169,12 @@ prometheus:
|
|||
replicas: 2
|
||||
storageSpec:
|
||||
volumeClaimTemplate:
|
||||
spec:
|
||||
storageClassName: longhorn
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
spec:
|
||||
storageClassName: longhorn
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 10Gi
|
||||
serviceMonitorNamespaceSelector:
|
||||
matchLabels:
|
||||
prometheus: yolokube
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue