Merge branch 'main' of ssh://gt.hexor.cy:30022/ab/homelab
All checks were successful
Update Kubernetes Services Wiki / Generate and Update K8s Wiki (push) Successful in 7s
Check with kubeconform / lint (push) Successful in 7s
Auto-update README / Generate README and Create MR (push) Successful in 4s

This commit is contained in:
AB
2026-02-10 11:36:43 +02:00
5 changed files with 319 additions and 9 deletions

View File

@@ -0,0 +1,46 @@
---
# AlertmanagerConfig (Prometheus Operator CRD) that routes all alerts to a
# Telegram bot, except the always-firing Watchdog heartbeat, which is sent
# to a no-op receiver. Reconstructed with proper YAML nesting — the pasted
# source had all indentation stripped, which makes the manifest invalid.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: telegram-notifications
  namespace: prometheus
  labels:
    # NOTE(review): these labels presumably match the Alertmanager's
    # alertmanagerConfigSelector — confirm against the Helm release values,
    # otherwise the operator will not pick this resource up.
    app: kube-prometheus-stack-alertmanager
    release: prometheus
spec:
  route:
    groupBy: ['alertname', 'cluster', 'service']
    groupWait: 10s
    groupInterval: 5m
    repeatInterval: 12h
    receiver: telegram
    routes:
      # Silence the Watchdog heartbeat by routing it to the 'null' receiver.
      - matchers:
          - name: alertname
            value: Watchdog
            matchType: "="
        receiver: 'null'
  receivers:
    - name: telegram
      telegramConfigs:
        # Bot token is read from a Secret key in this namespace.
        - botToken:
            name: alertmanager-telegram-secret
            key: TELEGRAM_BOT_TOKEN
          chatID: 124317807
          parseMode: HTML
          sendResolved: true
          disableNotifications: false
          # Alertmanager Go-template message, rendered with HTML parse mode.
          message: |
            {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
            {{ range .Alerts }}
            📊 <b>{{ .Labels.alertname }}</b>
            {{ .Annotations.summary }}
            {{ if .Annotations.node }}🖥 <b>Node:</b> <code>{{ .Annotations.node }}</code>{{ end }}
            {{ if .Annotations.pod }}📦 <b>Pod:</b> <code>{{ .Annotations.pod }}</code>{{ end }}
            {{ if .Annotations.namespace }}📁 <b>Namespace:</b> <code>{{ .Annotations.namespace }}</code>{{ end }}
            {{ if .Annotations.throttle_rate }}⚠️ <b>Throttling rate:</b> {{ .Annotations.throttle_rate }}{{ end }}
            🔗 <a href="{{ .GeneratorURL }}">View in Grafana</a>
            {{ end }}
    # No-op receiver used to drop the Watchdog route.
    - name: 'null'

View File

@@ -45,7 +45,7 @@ data:
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: A expression: A
reducer: last reducer: min
refId: B refId: B
type: reduce type: reduce
noDataState: NoData noDataState: NoData
@@ -63,7 +63,7 @@ data:
- orgId: 1 - orgId: 1
name: kubernetes_alerts name: kubernetes_alerts
folder: Kubernetes folder: Kubernetes
interval: 30s interval: 2m
rules: rules:
- uid: node_not_ready - uid: node_not_ready
title: Kubernetes Node Not Ready title: Kubernetes Node Not Ready
@@ -71,17 +71,17 @@ data:
data: data:
- refId: A - refId: A
relativeTimeRange: relativeTimeRange:
from: 300 from: 600
to: 0 to: 0
datasourceUid: P76F38748CEC837F0 datasourceUid: P76F38748CEC837F0
model: model:
expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' expr: 'kube_node_status_condition{condition="Ready",status="false"}'
refId: A refId: A
intervalMs: 1000 intervalMs: 1000
maxDataPoints: 43200 maxDataPoints: 43200
- refId: B - refId: B
relativeTimeRange: relativeTimeRange:
from: 300 from: 600
to: 0 to: 0
datasourceUid: __expr__ datasourceUid: __expr__
model: model:
@@ -98,12 +98,12 @@ data:
type: __expr__ type: __expr__
uid: __expr__ uid: __expr__
expression: A expression: A
reducer: last reducer: min
refId: B refId: B
type: reduce type: reduce
noDataState: Alerting noDataState: NoData
execErrState: Alerting execErrState: Alerting
for: 0s for: 10m
annotations: annotations:
node: '{{ $labels.node }}' node: '{{ $labels.node }}'
condition: '{{ $labels.condition }}' condition: '{{ $labels.condition }}'
@@ -111,6 +111,236 @@ data:
labels: labels:
severity: critical severity: critical
# Five Grafana-provisioned alert rules for node health (added lines of a diff
# hunk; enclosing document and original indentation are not visible here).
# Each rule: query A against Prometheus datasource P76F38748CEC837F0,
# reduce expression B, alert when B's condition holds for the `for` duration.
# Rule: memory used % = (1 - MemAvailable/MemTotal) * 100; fires when the
# max over the last 300s window exceeds 80 for 5m.
# NOTE(review): summary says "critically high" but severity is `warning` —
# confirm which is intended.
- uid: node_high_memory_usage
title: High Node Memory Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
# max reducer: alert if any sample in the window crosses the threshold.
reducer: max
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
memory_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node memory usage is critically high'
labels:
severity: warning
# Rule: CPU busy % = 100 - avg idle rate per instance; fires when the max
# over the window exceeds 80 for 10m.
- uid: node_high_cpu_usage
title: High Node CPU Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: max
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 10m
annotations:
node: '{{ $labels.instance }}'
cpu_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node CPU usage is critically high'
labels:
severity: warning
# Rule: disk used % per mounted real filesystem (ext2/3/4, xfs, zfs, btrfs);
# fires above 85% for 5m. The denominator carries no fstype filter, but
# PromQL division matches on identical label sets, so only the filtered
# series on the left pair up.
- uid: node_high_disk_usage
title: High Node Disk Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 85
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: max
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
filesystem: '{{ $labels.mountpoint }}'
disk_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node disk usage is critically high'
labels:
severity: critical
# Rule: 5-minute load average normalized by CPU count (count of idle-mode
# cpu series per instance); fires above 0.8 per core for 5m.
- uid: node_load_average_high
title: High Node Load Average
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.8
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: max
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
load_average: '{{ printf "%.2f" $values.A }}'
summary: 'Node load average is high relative to CPU count'
labels:
severity: warning
# Rule: node-exporter scrape health; fires when min(up) < 1 for 2m.
# NOTE(review): with noDataState: NoData, a target that disappears from the
# scrape config entirely (no `up` series) raises no alert — consider
# Alerting on NoData for a down-detector. Confirm intent.
- uid: node_exporter_down
title: Node Exporter Down
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: 'up{job="node-exporter"}'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1
type: lt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
# min reducer: alert if any scrape in the window reported up == 0.
reducer: min
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 2m
annotations:
node: '{{ $labels.instance }}'
summary: 'Node exporter is down - unable to collect metrics'
labels:
severity: critical
contactpoints.yaml: | contactpoints.yaml: |
apiVersion: 1 apiVersion: 1
contactPoints: contactPoints:
@@ -149,4 +379,4 @@ data:
- alertname - alertname
group_wait: 10s group_wait: 10s
group_interval: 5m group_interval: 5m
repeat_interval: 4h repeat_interval: 12h

View File

@@ -5,6 +5,7 @@ resources:
- persistentVolume.yaml - persistentVolume.yaml
- external-secrets.yaml - external-secrets.yaml
- grafana-alerting-configmap.yaml - grafana-alerting-configmap.yaml
- alertmanager-config.yaml
helmCharts: helmCharts:
- name: kube-prometheus-stack - name: kube-prometheus-stack

View File

@@ -26,11 +26,41 @@ alertmanager:
{{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }} {{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }}
{{ end }} {{ end }}
ingress:
enabled: true
ingressClassName: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
hosts:
- prom.hexor.cy
paths:
- /alertmanager
tls:
- secretName: alertmanager-tls
hosts:
- prom.hexor.cy
alertmanagerSpec: alertmanagerSpec:
secrets: secrets:
- alertmanager-telegram-secret - alertmanager-telegram-secret
externalUrl: https://prom.hexor.cy/alertmanager
routePrefix: /alertmanager
prometheus: prometheus:
ingress:
enabled: true
ingressClassName: traefik
annotations:
cert-manager.io/cluster-issuer: letsencrypt
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
hosts:
- prom.hexor.cy
paths:
- /
tls:
- secretName: prometheus-tls
hosts:
- prom.hexor.cy
prometheusSpec: prometheusSpec:
enableRemoteWriteReceiver: true enableRemoteWriteReceiver: true
additionalScrapeConfigs: additionalScrapeConfigs:

View File

@@ -189,6 +189,9 @@ EOT
meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png" meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
mode = "proxy" mode = "proxy"
outpost = "kubernetes-outpost" outpost = "kubernetes-outpost"
skip_path_regex = <<-EOT
/clients
EOT
} }
"pasarguard" = { "pasarguard" = {
name = "PasarGuard" name = "PasarGuard"