From 9d6fa51fc7dbcd0de51ac9b62a5219383bd1d286 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 12:44:34 +0200 Subject: [PATCH 01/12] Fixed node alert --- k8s/core/prom-stack/grafana-alerting-configmap.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index 27a412c..2dfdddd 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -63,7 +63,7 @@ data: - orgId: 1 name: kubernetes_alerts folder: Kubernetes - interval: 30s + interval: 2m rules: - uid: node_not_ready title: Kubernetes Node Not Ready @@ -71,7 +71,7 @@ data: data: - refId: A relativeTimeRange: - from: 300 + from: 600 to: 0 datasourceUid: P76F38748CEC837F0 model: @@ -81,7 +81,7 @@ data: maxDataPoints: 43200 - refId: B relativeTimeRange: - from: 300 + from: 600 to: 0 datasourceUid: __expr__ model: @@ -103,7 +103,7 @@ data: type: reduce noDataState: Alerting execErrState: Alerting - for: 0s + for: 10m annotations: node: '{{ $labels.node }}' condition: '{{ $labels.condition }}' @@ -149,4 +149,4 @@ data: - alertname group_wait: 10s group_interval: 5m - repeat_interval: 4h + repeat_interval: 12h From 1aee4d5cd7678eb8fc4c734e1caa8d2974426953 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 12:48:09 +0200 Subject: [PATCH 02/12] Fixed node alert --- k8s/core/prom-stack/grafana-alerting-configmap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index 2dfdddd..706abad 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -75,7 +75,7 @@ data: to: 0 datasourceUid: P76F38748CEC837F0 model: - expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' + expr: 'kube_node_status_condition{condition="Ready",status="false"} == 1' refId: A intervalMs: 1000 maxDataPoints: 43200 From 137384ce55a3a17a3eb461c8936df7957bf26d26 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 12:52:22 +0200 Subject: [PATCH 03/12] Fixed node alert --- k8s/core/prom-stack/grafana-alerting-configmap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index 706abad..2dfdddd 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -75,7 +75,7 @@ data: to: 0 datasourceUid: P76F38748CEC837F0 model: - expr: 'kube_node_status_condition{condition="Ready",status="false"} == 1' + expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' refId: A intervalMs: 1000 maxDataPoints: 43200 From 8a8cab019f3e8ca8dbaee96715f57b0531557808 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 13:00:15 +0200 Subject: [PATCH 04/12] Added node alerts --- .../grafana-alerting-configmap.yaml | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index 2dfdddd..9c7ad41 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -110,6 +110,236 @@ data: summary: 'Kubernetes node is not ready' labels: severity: critical + + - uid: node_high_memory_usage + title: High Node Memory Usage + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: P76F38748CEC837F0 + model: + expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100' + refId: A + intervalMs: 1000 + maxDataPoints: 43200 + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: [] + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + refId: B + type: reduce + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + node: '{{ $labels.instance }}' + memory_usage: '{{ printf "%.1f%%" $values.A }}' + summary: 'Node memory usage is critically high' + labels: + severity: warning + + - uid: node_high_cpu_usage + title: High Node CPU Usage + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: P76F38748CEC837F0 + model: + expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)' + refId: A + intervalMs: 1000 + maxDataPoints: 43200 + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: [] + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + refId: B + type: reduce + noDataState: NoData + execErrState: Alerting + for: 10m + annotations: + node: '{{ $labels.instance }}' + cpu_usage: '{{ printf "%.1f%%" $values.A }}' + summary: 'Node CPU usage is critically high' + labels: + severity: warning + + - uid: node_high_disk_usage + title: High Node Disk Usage + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: P76F38748CEC837F0 + model: + expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes)) * 100' + refId: A + intervalMs: 1000 + maxDataPoints: 43200 + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + operator: + type: and + query: + params: [] + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + refId: B + type: reduce + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + node: '{{ $labels.instance }}' + filesystem: '{{ $labels.mountpoint }}' + disk_usage: '{{ printf "%.1f%%" $values.A }}' + summary: 'Node disk usage is critically high' + labels: + severity: critical + + - uid: node_load_average_high + title: High Node Load Average + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: P76F38748CEC837F0 + model: + expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})' + refId: A + intervalMs: 1000 + maxDataPoints: 43200 + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0.8 + type: gt + operator: + type: and + query: + params: [] + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + refId: B + type: reduce + noDataState: NoData + execErrState: Alerting + for: 5m + annotations: + node: '{{ $labels.instance }}' + load_average: '{{ printf "%.2f" $values.A }}' + summary: 'Node load average is high relative to CPU count' + labels: + severity: warning + + - uid: node_exporter_down + title: Node Exporter Down + condition: B + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: P76F38748CEC837F0 + model: + expr: 'up{job="node-exporter"}' + refId: A + intervalMs: 1000 + maxDataPoints: 43200 + - refId: B + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1 + type: lt + operator: + type: and + query: + params: [] + datasource: + type: __expr__ + uid: __expr__ + expression: A + reducer: last + refId: B + type: reduce + noDataState: Alerting + execErrState: Alerting + for: 2m + annotations: + node: '{{ $labels.instance }}' + summary: 'Node exporter is down - unable to collect metrics' + labels: + severity: critical contactpoints.yaml: | apiVersion: 1 From d19ae33cd15aa9c03e1463504816da251f4baec6 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 13:04:25 +0200 Subject: [PATCH 05/12] Adjusted node alerts --- k8s/core/prom-stack/grafana-alerting-configmap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index 9c7ad41..d864833 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -75,7 +75,7 @@ data: to: 0 datasourceUid: P76F38748CEC837F0 model: - expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' + expr: 'kube_node_status_condition{condition="Ready",status="false"}' refId: A intervalMs: 1000 maxDataPoints: 43200 From 1451a5fb373bffcf070bee80e619594a6b9a55ef Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 14:15:17 +0200 Subject: [PATCH 06/12] 2 Adjusted node alerts --- .../grafana-alerting-configmap.yaml | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml index d864833..fc49d45 100644 --- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml +++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml @@ -45,7 +45,7 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: min refId: B type: reduce noDataState: NoData @@ -98,10 +98,10 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: min refId: B type: reduce - noDataState: Alerting + noDataState: NoData execErrState: Alerting for: 10m annotations: @@ -144,7 +144,7 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: max refId: B type: reduce noDataState: NoData @@ -190,7 +190,7 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: max refId: B type: reduce noDataState: NoData @@ -213,7 +213,7 @@ data: to: 0 datasourceUid: P76F38748CEC837F0 model: - expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes)) * 100' + expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100' refId: A intervalMs: 1000 maxDataPoints: 43200 @@ -236,7 +236,7 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: max refId: B type: reduce noDataState: NoData @@ -283,7 +283,7 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: max refId: B type: reduce noDataState: NoData @@ -329,10 +329,10 @@ data: type: __expr__ uid: __expr__ expression: A - reducer: last + reducer: min refId: B type: reduce - noDataState: Alerting + noDataState: NoData execErrState: Alerting for: 2m annotations: From fc00513db31ad3ea40b15ff7114b5a4dbd2e4922 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 14:46:27 +0200 Subject: [PATCH 07/12] 3 Adjusted node alerts --- k8s/core/prom-stack/alertmanager-config.yaml | 46 ++++++++++++++++++++ k8s/core/prom-stack/kustomization.yaml | 1 + 2 files changed, 47 insertions(+) create mode 100644 k8s/core/prom-stack/alertmanager-config.yaml diff --git a/k8s/core/prom-stack/alertmanager-config.yaml b/k8s/core/prom-stack/alertmanager-config.yaml new file mode 100644 index 0000000..0b0221d --- /dev/null +++ b/k8s/core/prom-stack/alertmanager-config.yaml @@ -0,0 +1,46 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: telegram-notifications + namespace: prometheus + labels: + app: kube-prometheus-stack-alertmanager + release: prometheus +spec: + route: + groupBy: ['alertname', 'cluster', 'service'] + groupWait: 10s + groupInterval: 5m + repeatInterval: 12h + receiver: telegram + routes: + - matchers: + - name: alertname + value: Watchdog + matchType: "=" + receiver: 'null' + receivers: + - name: telegram + telegramConfigs: + - botToken: + name: alertmanager-telegram-secret + key: TELEGRAM_BOT_TOKEN + chatID: 124317807 + parseMode: HTML + sendResolved: true + disableNotifications: false + message: | + {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }} + + {{ range .Alerts }} + 📊 {{ .Labels.alertname }} + {{ .Annotations.summary }} + + {{ if .Annotations.node }}🖥 Node: {{ .Annotations.node }}{{ end }} + {{ if .Annotations.pod }}📦 Pod: {{ .Annotations.pod }}{{ end }} + {{ if .Annotations.namespace }}📁 Namespace: {{ .Annotations.namespace }}{{ end }} + {{ if .Annotations.throttle_rate }}⚠️ Throttling rate: {{ .Annotations.throttle_rate }}{{ end }} + + 🔗 View in Grafana + {{ end }} + - name: 'null' diff --git a/k8s/core/prom-stack/kustomization.yaml b/k8s/core/prom-stack/kustomization.yaml index e08a833..225c5e4 100644 --- a/k8s/core/prom-stack/kustomization.yaml +++ b/k8s/core/prom-stack/kustomization.yaml @@ -5,6 +5,7 @@ resources: - persistentVolume.yaml - external-secrets.yaml - grafana-alerting-configmap.yaml + - alertmanager-config.yaml helmCharts: - name: kube-prometheus-stack From 014284c11d864e5e298a179fbc7c71fa1b741cc2 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 14:51:43 +0200 Subject: [PATCH 08/12] Added alertmanager ingress --- k8s/core/prom-stack/prom-values.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/k8s/core/prom-stack/prom-values.yaml b/k8s/core/prom-stack/prom-values.yaml index b96bd52..2978ab0 100644 --- a/k8s/core/prom-stack/prom-values.yaml +++ b/k8s/core/prom-stack/prom-values.yaml @@ -29,6 +29,17 @@ alertmanager: alertmanagerSpec: secrets: - alertmanager-telegram-secret + ingress: + enabled: true + ingressClassName: traefik + hosts: + - prom.hexor.cy + paths: + - /alertmanager + tls: + - secretName: grafana-tls + hosts: + - prom.hexor.cy prometheus: prometheusSpec: From 1b2b4da98dbd29007c5b0e9e319497806ed11891 Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 14:55:22 +0200 Subject: [PATCH 09/12] Added alertmanager ingress --- k8s/core/prom-stack/prom-values.yaml | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/k8s/core/prom-stack/prom-values.yaml b/k8s/core/prom-stack/prom-values.yaml index 2978ab0..850e06b 100644 --- a/k8s/core/prom-stack/prom-values.yaml +++ b/k8s/core/prom-stack/prom-values.yaml @@ -26,20 +26,23 @@ alertmanager: {{ if .Annotations.description }}Description: {{ .Annotations.description }}{{ end }} {{ end }} + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd + hosts: + - prom.hexor.cy + paths: + - / + tls: + - secretName: alertmanager-tls + hosts: + - prom.hexor.cy alertmanagerSpec: secrets: - alertmanager-telegram-secret - ingress: - enabled: true - ingressClassName: traefik - hosts: - - prom.hexor.cy - paths: - - /alertmanager - tls: - - secretName: grafana-tls - hosts: - - prom.hexor.cy prometheus: prometheusSpec: From c28566ce21277605cbe52585dd41307c2435ee1c Mon Sep 17 00:00:00 2001 From: Ultradesu Date: Mon, 9 Feb 2026 14:57:52 +0200 Subject: [PATCH 10/12] Added alertmanager ingress --- k8s/core/prom-stack/prom-values.yaml | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/k8s/core/prom-stack/prom-values.yaml b/k8s/core/prom-stack/prom-values.yaml index 850e06b..fff45e2 100644 --- a/k8s/core/prom-stack/prom-values.yaml +++ b/k8s/core/prom-stack/prom-values.yaml @@ -35,7 +35,7 @@ alertmanager: hosts: - prom.hexor.cy paths: - - / + - /alertmanager tls: - secretName: alertmanager-tls hosts: @@ -43,8 +43,24 @@ alertmanager: alertmanagerSpec: secrets: - alertmanager-telegram-secret + externalUrl: https://prom.hexor.cy/alertmanager + routePrefix: /alertmanager prometheus: + ingress: + enabled: true + ingressClassName: traefik + annotations: + cert-manager.io/cluster-issuer: letsencrypt + traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd + hosts: + - prom.hexor.cy + paths: + - / + tls: + - secretName: prometheus-tls + hosts: + - prom.hexor.cy prometheusSpec: enableRemoteWriteReceiver: true additionalScrapeConfigs: From bf337653021da98121ed22d8393a44da91584f6a Mon Sep 17 00:00:00 2001 From: ab Date: Tue, 10 Feb 2026 08:34:40 +0000 Subject: [PATCH 11/12] Update terraform/authentik/proxy-apps.tfvars --- terraform/authentik/proxy-apps.tfvars | 3 +++ 1 file changed, 3 insertions(+) diff --git a/terraform/authentik/proxy-apps.tfvars b/terraform/authentik/proxy-apps.tfvars index f68705a..ec518ad 100644 --- a/terraform/authentik/proxy-apps.tfvars +++ b/terraform/authentik/proxy-apps.tfvars @@ -176,6 +176,9 @@ EOT access_groups = ["admins", "khm"] # Используем существующие группы create_group = true access_groups = ["admins"] + skip_path_regex = <<-EOT +/client +EOT } "minecraft" = { From c19a086b3831e0ab2940eb6f1965c23238df250b Mon Sep 17 00:00:00 2001 From: ab Date: Tue, 10 Feb 2026 08:38:31 +0000 Subject: [PATCH 12/12] Update terraform/authentik/proxy-apps.tfvars --- terraform/authentik/proxy-apps.tfvars | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/terraform/authentik/proxy-apps.tfvars b/terraform/authentik/proxy-apps.tfvars index ec518ad..956b55f 100644 --- a/terraform/authentik/proxy-apps.tfvars +++ b/terraform/authentik/proxy-apps.tfvars @@ -176,9 +176,6 @@ EOT access_groups = ["admins", "khm"] # Используем существующие группы create_group = true access_groups = ["admins"] - skip_path_regex = <<-EOT -/client -EOT } "minecraft" = { @@ -192,6 +189,9 @@ EOT meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png" mode = "proxy" outpost = "kubernetes-outpost" + skip_path_regex = <<-EOT +/clients +EOT } "pasarguard" = { name = "PasarGuard"