fix typos and file layout for yamllint

2024-10-07 09:19:39 +02:00 · 2024-10-07 09:19:39 +02:00 · e00cc2d4dd
commit e00cc2d4dd
parent 61d19f3413
65 changed files with 684 additions and 631 deletions
--- a/prometheus/alerts.yaml
+++ b/prometheus/alerts.yaml
@@ -10,114 +10,114 @@ spec:
  groups:
    - name: hardware
      rules:
-      - alert: MemoryHigh
-        expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Memory over 80%"
-          description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
-      - alert: DiskspaceLow
-        expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: "Free disk space at {{ $value }}%"
-          description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"
-      - alert: HostMemoryUnderMemoryPressure
-        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
-        for: 2m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host memory under memory pressure {{ $labels.node }}
-          description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: HostUnusualDiskReadRate
-        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk read rate {{ $labels.node }}
-          description: "Disk is probably reading too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: HostUnusualDiskWriteRate
-        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
-        for: 3m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host unusual disk write rate {{ $labels.node }}
-          description: "Disk is probably writing too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: HostCpuStealNoisyNeighbor
-        expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
-        for: 1m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host CPU steal noisy neighbor {{ $labels.node }}
-          description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: HostPhysicalComponentTooHot
-        expr: node_hwmon_temp_celsius > 90
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: Host physical component too hot {{ $labels.node }}
-          description: "Physical hardware component too hot\n  Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
-      - alert: SMARTbad
-        expr: smartmon_device_smart_healthy < 1
-        for: 0m
-        labels:
-          severity: critical
-        annotations:
-          summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
-          description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
-      - alert: "SMARTcheck too old"
-        expr: (time() - smartmon_smartctl_run) > 10800
-        labels:
-          severity: warning
-        annotations:
-          summary: "SMARTcheck not running"
-          description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
-      - alert: "ECC Memory errors"
-        expr: (node_edac_correctable_errors_total) > 100
-        labels:
-          severity: warning
-        annotations:
-          summary: "ECC errors on {{ $labels.node }}"
-          description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
-      - alert: "ECC Memory uncorrectable errors"
-        expr: (node_edac_uncorrectable_errors_total) > 0
-        labels:
-          severity: critical
-        annotations:
-          summary: "ECC errors on {{ $labels.node }}"
-          description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
+        - alert: MemoryHigh
+          expr: round((((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100), 0.1) > 80
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Memory over 80%"
+            description: "Memory on node {{ $labels.node }} is over 80% for more than 5 minutes. Plox fix. Memory usage: {{ $value }}%"
+        - alert: DiskspaceLow
+          expr: round(node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100, 1) < 5
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: "Free disk space at {{ $value }}%"
+            description: "Disk space on node {{ $labels.node }} is only {{ $value }}%. Plox fix. Partition: {{ $labels.device }}"
+        - alert: HostMemoryUnderMemoryPressure
+          expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+          for: 2m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host memory under memory pressure {{ $labels.node }}
+            description: "The node is under heavy memory pressure. High rate of major page faults\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        - alert: HostUnusualDiskReadRate
+          expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 200
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host unusual disk read rate {{ $labels.node }}
+            description: "Disk is probably reading too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        - alert: HostUnusualDiskWriteRate
+          expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 200
+          for: 3m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host unusual disk write rate {{ $labels.node }}
+            description: "Disk is probably writing too much data (> 200 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        - alert: HostCpuStealNoisyNeighbor
+          expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+          for: 1m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host CPU steal noisy neighbor {{ $labels.node }}
+            description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        - alert: HostPhysicalComponentTooHot
+          expr: node_hwmon_temp_celsius > 90
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: Host physical component too hot {{ $labels.node }}
+            description: "Physical hardware component too hot\n  Sensor = {{ $labels.sensor }}\n Temp = {{ $value }}"
+        - alert: SMARTbad
+          expr: smartmon_device_smart_healthy < 1
+          for: 0m
+          labels:
+            severity: critical
+          annotations:
+            summary: SMART check bad of drive {{ $labels.exported_disk }} in node {{ $labels.node }}
+            description: "SMART check returned bad health of {{ $labels.exported_disk }} in node {{ $labels.node }}. VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+        - alert: "SMARTcheck too old"
+          expr: (time() - smartmon_smartctl_run) > 10800
+          labels:
+            severity: warning
+          annotations:
+            summary: "SMARTcheck not running"
+            description: 'The last SMARTcheck on node {{ $labels.node }} was more than 3h ago. Plox fix.'
+        - alert: "ECC Memory errors"
+          expr: (node_edac_correctable_errors_total) > 100
+          labels:
+            severity: warning
+          annotations:
+            summary: "ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} correctable errors.'
+        - alert: "ECC Memory uncorrectable errors"
+          expr: (node_edac_uncorrectable_errors_total) > 0
+          labels:
+            severity: critical
+          annotations:
+            summary: "ECC errors on {{ $labels.node }}"
+            description: 'The node {{ $labels.node }} accumulated {{ $value }} uncorrectable errors.'
    - name: etcdbackup
      rules:
-      - alert: "etcdbackup too old"
-        expr: (time() - etcdbackup_time) > 10800
-        labels:
-          severity: warning
-        annotations:
-          summary: "etcd backup not running"
-          description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
-      - alert: "etcdbackup failed"
-        expr: etcdbackup_result > 0
-        labels:
-          severity: warning
-        annotations:
-          summary: "etcdbackup failed"
-          description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
+        - alert: "etcdbackup too old"
+          expr: (time() - etcdbackup_time) > 10800
+          labels:
+            severity: warning
+          annotations:
+            summary: "etcd backup not running"
+            description: 'The last etcd backup on node {{ $labels.node }} was more than 3h ago. Plox fix.'
+        - alert: "etcdbackup failed"
+          expr: etcdbackup_result > 0
+          labels:
+            severity: warning
+          annotations:
+            summary: "etcdbackup failed"
+            description: "The backup script for etcd failed on node {{ $labels.node }}. Plox fix."
    - name: kubernetes
      rules:
-      - alert: KubernetesUnhealthyPod
-        expr: kube_pod_container_status_waiting_reason == 1
-        for: 5m
-        labels:
-          severity: warning
-        annotations:
-          summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
-          description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
+        - alert: KubernetesUnhealthyPod
+          expr: kube_pod_container_status_waiting_reason == 1
+          for: 5m
+          labels:
+            severity: warning
+          annotations:
+            summary: "The Pod {{ $labels.pod }} is {{ $labels.reason }}"
+            description: "The Pod {{ $labels.pod }} is in the state {{ $labels.reason }} for more than 5m. The Pod is in namespace {{ $labels.namespace }} and on node {{ $labels.node }}."
--- a/prometheus/kustomization.yaml
+++ b/prometheus/kustomization.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 generators:
--- a/prometheus/namespace.yaml
+++ b/prometheus/namespace.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: v1
 kind: Namespace
 metadata:
--- a/prometheus/secret-generator.yaml
+++ b/prometheus/secret-generator.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: viaduct.ai/v1
 kind: ksops
 metadata:
--- a/prometheus/secret.yaml
+++ b/prometheus/secret.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: v1
 kind: Secret
 metadata:
--- a/prometheus/service-monitor-longhorn.yaml
+++ b/prometheus/service-monitor-longhorn.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: monitoring.coreos.com/v1
 kind: ServiceMonitor
 metadata:
@@ -11,6 +12,6 @@ spec:
      app: longhorn-manager
  namespaceSelector:
    matchNames:
-    - longhorn-system
+      - longhorn-system
  endpoints:
-  - port: manager
+    - port: manager
--- a/prometheus/templates.yaml
+++ b/prometheus/templates.yaml
@@ -1,3 +1,4 @@
+---
 apiVersion: v1
 kind: ConfigMap
 metadata:
@@ -12,4 +13,4 @@ data:

      {{ .Annotations.description }}
      {{ end }}
-    {{ end }}
+    {{ end }}
--- a/prometheus/values.yaml
+++ b/prometheus/values.yaml
@@ -1,3 +1,4 @@
+---
 alertmanager:
  alertmanagerSpec:
    podAntiAffinity: "hard"
@@ -8,11 +9,11 @@ alertmanager:
      - "templates"
    storage:
      volumeClaimTemplate:
-       spec:
-         accessModes: ["ReadWriteOnce"]
-         resources:
-           requests:
-             storage: 5Gi
+        spec:
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 5Gi
    useExistingSecret: false
  config:
    global:
@@ -27,20 +28,20 @@ alertmanager:
      receiver: 'tg1'
      routes:
        - matchers:
-          - severity=warning
+            - severity=warning
          receiver: 'tg1'
        - matchers:
-          - severity=critical
+            - severity=critical
          receiver: 'tg1'
    receivers:
-    - name: tg1
-      telegram_configs:
-      - bot_token_file: '/etc/alertmanager/secrets/telegram-api/api_key'
-        chat_id: -995270884
-        api_url: "https://api.telegram.org"
-        send_resolved: true
-        parse_mode: "HTML"
-        message: '{{ template "telegram.aaron" .}}'
+      - name: tg1
+        telegram_configs:
+          - bot_token_file: '/etc/alertmanager/secrets/telegram-api/api_key'
+            chat_id: -995270884
+            api_url: "https://api.telegram.org"
+            send_resolved: true
+            parse_mode: "HTML"
+            message: '{{ template "telegram.aaron" .}}'
    inhibit_rules:
      - source_matchers:
          - severity = critical
@@ -97,7 +98,7 @@ grafana:
  persistence:
    enabled: true
    accessModes:
-     - ReadWriteMany
+      - ReadWriteMany
  grafana.ini:
    auth:
      disable_login_form: true
@@ -168,12 +169,12 @@ prometheus:
    replicas: 2
    storageSpec:
      volumeClaimTemplate:
-       spec:
-         storageClassName: longhorn
-         accessModes: ["ReadWriteOnce"]
-         resources:
-           requests:
-             storage: 10Gi
+        spec:
+          storageClassName: longhorn
+          accessModes: ["ReadWriteOnce"]
+          resources:
+            requests:
+              storage: 10Gi
    serviceMonitorNamespaceSelector:
      matchLabels:
        prometheus: yolokube