diff --git a/k8s/core/prom-stack/alertmanager-config.yaml b/k8s/core/prom-stack/alertmanager-config.yaml
new file mode 100644
index 0000000..0b0221d
--- /dev/null
+++ b/k8s/core/prom-stack/alertmanager-config.yaml
@@ -0,0 +1,49 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: telegram-notifications
+  namespace: prometheus
+  labels:
+    app: kube-prometheus-stack-alertmanager
+    release: prometheus
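+    # Assumption: these labels match the Alertmanager's alertmanagerConfigSelector; if they don't, the operator silently ignores this config.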
+spec:
+  route:
+    groupBy: ['alertname', 'cluster', 'service']
+    groupWait: 10s
+    groupInterval: 5m
+    repeatInterval: 12h
+    receiver: telegram
+    routes:
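+      # Route the always-firing Watchdog heartbeat to the 'null' receiver so it never notifies.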
+      - matchers:
+          - name: alertname
+            value: Watchdog
+            matchType: "="
+        receiver: 'null'
+  receivers:
+    - name: telegram
+      telegramConfigs:
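+        # The referenced Secret must also be listed under alertmanagerSpec.secrets (see prom-values.yaml) so it is mounted into the pod.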
+        - botToken:
+            name: alertmanager-telegram-secret
+            key: TELEGRAM_BOT_TOKEN
+          chatID: 124317807
+          parseMode: HTML
+          sendResolved: true
+          disableNotifications: false
+          message: |
+            {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
+
+            {{ range .Alerts }}
+            📊 {{ .Labels.alertname }}
+            {{ .Annotations.summary }}
+
+            {{ if .Annotations.node }}🖥 Node: {{ .Annotations.node }}{{ end }}
+            {{ if .Annotations.pod }}📦 Pod: {{ .Annotations.pod }}{{ end }}
+            {{ if .Annotations.namespace }}📁 Namespace: {{ .Annotations.namespace }}{{ end }}
+            {{ if .Annotations.throttle_rate }}⚠️ Throttling rate: {{ .Annotations.throttle_rate }}{{ end }}
+
+            🔗 View in Grafana
+            {{ end }}
+    - name: 'null'
diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
index 27a412c..fc49d45 100644
--- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml
+++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
@@ -45,7 +45,7 @@ data:
                     type: __expr__
                     uid: __expr__
                   expression: A
-                  reducer: last
+                  reducer: min
                   refId: B
                   type: reduce
             noDataState: NoData
@@ -63,7 +63,7 @@ data:
       - orgId: 1
         name: kubernetes_alerts
         folder: Kubernetes
-        interval: 30s
+        interval: 2m
         rules:
           - uid: node_not_ready
             title: Kubernetes Node Not Ready
@@ -71,17 +71,17 @@ data:
             data:
               - refId: A
                 relativeTimeRange:
-                  from: 300
+                  from: 600
                   to: 0
                 datasourceUid: P76F38748CEC837F0
                 model:
-                  expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
+                  expr: 'kube_node_status_condition{condition="Ready",status="false"}'
                   refId: A
                   intervalMs: 1000
                   maxDataPoints: 43200
               - refId: B
                 relativeTimeRange:
-                  from: 300
+                  from: 600
                   to: 0
                 datasourceUid: __expr__
                 model:
@@ -98,18 +98,253 @@ data:
                     type: __expr__
                     uid: __expr__
                   expression: A
-                  reducer: last
+                  reducer: min
                   refId: B
                   type: reduce
-            noDataState: Alerting
+            noDataState: NoData
             execErrState: Alerting
-            for: 0s
+            for: 10m
             annotations:
               node: '{{ $labels.node }}'
               condition: '{{ $labels.condition }}'
               summary: 'Kubernetes node is not ready'
             labels:
               severity: critical
+
+          - uid: node_high_memory_usage
+            title: High Node Memory Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
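+                  # MemAvailable includes reclaimable cache, so this tracks real memory pressure rather than free RAM alone.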
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node memory usage is critically high'
+            labels:
+              severity: warning
+
+          - uid: node_high_cpu_usage
+            title: High Node CPU Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
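+                  # Busy CPU percentage: 100 minus the average per-instance idle rate over the last 5m.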
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 10m
+            annotations:
+              node: '{{ $labels.instance }}'
+              cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node CPU usage is critically high'
+            labels:
+              severity: warning
+
+          - uid: node_high_disk_usage
+            title: High Node Disk Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
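+                  # The fstype allow-list restricts the check to real filesystems, skipping tmpfs/overlay pseudo-mounts.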
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 85
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              filesystem: '{{ $labels.mountpoint }}'
+              disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node disk usage is critically high'
+            labels:
+              severity: critical
+
+          - uid: node_load_average_high
+            title: High Node Load Average
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
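+                  # 5m load average normalized by core count; the 0.8 threshold fires when load reaches 80% of available cores.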
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 0.8
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              load_average: '{{ printf "%.2f" $values.B.Value }}'
+              summary: 'Node load average is high relative to CPU count'
+            labels:
+              severity: warning
+
+          - uid: node_exporter_down
+            title: Node Exporter Down
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: 'up{job="node-exporter"}'
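+                  # up is 1 while Prometheus can scrape the target and drops to 0 when the scrape fails.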
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 1
+                        type: lt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: min
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 2m
+            annotations:
+              node: '{{ $labels.instance }}'
+              summary: 'Node exporter is down - unable to collect metrics'
+            labels:
+              severity: critical

   contactpoints.yaml: |
     apiVersion: 1
@@ -149,4 +384,4 @@ data:
           - alertname
         group_wait: 10s
         group_interval: 5m
-        repeat_interval: 4h
+        repeat_interval: 12h
diff --git a/k8s/core/prom-stack/kustomization.yaml b/k8s/core/prom-stack/kustomization.yaml
index e08a833..225c5e4 100644
--- a/k8s/core/prom-stack/kustomization.yaml
+++ b/k8s/core/prom-stack/kustomization.yaml
@@ -5,6 +5,7 @@ resources:
   - persistentVolume.yaml
   - external-secrets.yaml
   - grafana-alerting-configmap.yaml
+  - alertmanager-config.yaml

 helmCharts:
   - name: kube-prometheus-stack
diff --git a/k8s/core/prom-stack/prom-values.yaml b/k8s/core/prom-stack/prom-values.yaml
index b96bd52..fff45e2 100644
--- a/k8s/core/prom-stack/prom-values.yaml
+++ b/k8s/core/prom-stack/prom-values.yaml
@@ -26,11 +26,42 @@ alertmanager:
           {{ if .Annotations.description }}Description: {{ .Annotations.description }}{{ end }}
           {{ end }}

+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /alertmanager
+    tls:
+      - secretName: alertmanager-tls
+        hosts:
+          - prom.hexor.cy
   alertmanagerSpec:
     secrets:
       - alertmanager-telegram-secret
+    externalUrl: https://prom.hexor.cy/alertmanager
+    routePrefix: /alertmanager
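+    # externalUrl and routePrefix must agree with the ingress path above, or links in notifications will point at the wrong URL.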

 prometheus:
+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /
+    tls:
+      - secretName: prometheus-tls
+        hosts:
+          - prom.hexor.cy
   prometheusSpec:
     enableRemoteWriteReceiver: true
     additionalScrapeConfigs:
diff --git a/terraform/authentik/proxy-apps.tfvars b/terraform/authentik/proxy-apps.tfvars
index f68705a..956b55f 100644
--- a/terraform/authentik/proxy-apps.tfvars
+++ b/terraform/authentik/proxy-apps.tfvars
@@ -189,6 +189,10 @@ EOT
     meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
     mode      = "proxy"
     outpost   = "kubernetes-outpost"
+    skip_path_regex = <<-EOT
+    /clients
+    EOT
   }
   "pasarguard" = {
     name = "PasarGuard"