Merge branch 'main' of ssh://gt.hexor.cy:30022/ab/homelab
This commit is contained in:
46
k8s/core/prom-stack/alertmanager-config.yaml
Normal file
46
k8s/core/prom-stack/alertmanager-config.yaml
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
apiVersion: monitoring.coreos.com/v1alpha1
|
||||||
|
kind: AlertmanagerConfig
|
||||||
|
metadata:
|
||||||
|
name: telegram-notifications
|
||||||
|
namespace: prometheus
|
||||||
|
labels:
|
||||||
|
app: kube-prometheus-stack-alertmanager
|
||||||
|
release: prometheus
|
||||||
|
spec:
|
||||||
|
route:
|
||||||
|
groupBy: ['alertname', 'cluster', 'service']
|
||||||
|
groupWait: 10s
|
||||||
|
groupInterval: 5m
|
||||||
|
repeatInterval: 12h
|
||||||
|
receiver: telegram
|
||||||
|
routes:
|
||||||
|
- matchers:
|
||||||
|
- name: alertname
|
||||||
|
value: Watchdog
|
||||||
|
matchType: "="
|
||||||
|
receiver: 'null'
|
||||||
|
receivers:
|
||||||
|
- name: telegram
|
||||||
|
telegramConfigs:
|
||||||
|
- botToken:
|
||||||
|
name: alertmanager-telegram-secret
|
||||||
|
key: TELEGRAM_BOT_TOKEN
|
||||||
|
chatID: 124317807
|
||||||
|
parseMode: HTML
|
||||||
|
sendResolved: true
|
||||||
|
disableNotifications: false
|
||||||
|
message: |
|
||||||
|
{{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
|
||||||
|
|
||||||
|
{{ range .Alerts }}
|
||||||
|
📊 <b>{{ .Labels.alertname }}</b>
|
||||||
|
{{ .Annotations.summary }}
|
||||||
|
|
||||||
|
{{ if .Annotations.node }}🖥 <b>Node:</b> <code>{{ .Annotations.node }}</code>{{ end }}
|
||||||
|
{{ if .Annotations.pod }}📦 <b>Pod:</b> <code>{{ .Annotations.pod }}</code>{{ end }}
|
||||||
|
{{ if .Annotations.namespace }}📁 <b>Namespace:</b> <code>{{ .Annotations.namespace }}</code>{{ end }}
|
||||||
|
{{ if .Annotations.throttle_rate }}⚠️ <b>Throttling rate:</b> {{ .Annotations.throttle_rate }}{{ end }}
|
||||||
|
|
||||||
|
🔗 <a href="{{ .GeneratorURL }}">View in Grafana</a>
|
||||||
|
{{ end }}
|
||||||
|
- name: 'null'
|
||||||
@@ -45,7 +45,7 @@ data:
|
|||||||
type: __expr__
|
type: __expr__
|
||||||
uid: __expr__
|
uid: __expr__
|
||||||
expression: A
|
expression: A
|
||||||
reducer: last
|
reducer: min
|
||||||
refId: B
|
refId: B
|
||||||
type: reduce
|
type: reduce
|
||||||
noDataState: NoData
|
noDataState: NoData
|
||||||
@@ -63,7 +63,7 @@ data:
|
|||||||
- orgId: 1
|
- orgId: 1
|
||||||
name: kubernetes_alerts
|
name: kubernetes_alerts
|
||||||
folder: Kubernetes
|
folder: Kubernetes
|
||||||
interval: 30s
|
interval: 2m
|
||||||
rules:
|
rules:
|
||||||
- uid: node_not_ready
|
- uid: node_not_ready
|
||||||
title: Kubernetes Node Not Ready
|
title: Kubernetes Node Not Ready
|
||||||
@@ -71,17 +71,17 @@ data:
|
|||||||
data:
|
data:
|
||||||
- refId: A
|
- refId: A
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
from: 300
|
from: 600
|
||||||
to: 0
|
to: 0
|
||||||
datasourceUid: P76F38748CEC837F0
|
datasourceUid: P76F38748CEC837F0
|
||||||
model:
|
model:
|
||||||
expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
|
expr: 'kube_node_status_condition{condition="Ready",status="false"}'
|
||||||
refId: A
|
refId: A
|
||||||
intervalMs: 1000
|
intervalMs: 1000
|
||||||
maxDataPoints: 43200
|
maxDataPoints: 43200
|
||||||
- refId: B
|
- refId: B
|
||||||
relativeTimeRange:
|
relativeTimeRange:
|
||||||
from: 300
|
from: 600
|
||||||
to: 0
|
to: 0
|
||||||
datasourceUid: __expr__
|
datasourceUid: __expr__
|
||||||
model:
|
model:
|
||||||
@@ -98,18 +98,248 @@ data:
|
|||||||
type: __expr__
|
type: __expr__
|
||||||
uid: __expr__
|
uid: __expr__
|
||||||
expression: A
|
expression: A
|
||||||
reducer: last
|
reducer: min
|
||||||
refId: B
|
refId: B
|
||||||
type: reduce
|
type: reduce
|
||||||
noDataState: Alerting
|
noDataState: NoData
|
||||||
execErrState: Alerting
|
execErrState: Alerting
|
||||||
for: 0s
|
for: 10m
|
||||||
annotations:
|
annotations:
|
||||||
node: '{{ $labels.node }}'
|
node: '{{ $labels.node }}'
|
||||||
condition: '{{ $labels.condition }}'
|
condition: '{{ $labels.condition }}'
|
||||||
summary: 'Kubernetes node is not ready'
|
summary: 'Kubernetes node is not ready'
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
|
||||||
|
- uid: node_high_memory_usage
|
||||||
|
title: High Node Memory Usage
|
||||||
|
condition: B
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: P76F38748CEC837F0
|
||||||
|
model:
|
||||||
|
expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
|
||||||
|
refId: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 80
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
reducer: max
|
||||||
|
refId: B
|
||||||
|
type: reduce
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
node: '{{ $labels.instance }}'
|
||||||
|
memory_usage: '{{ printf "%.1f%%" $values.A }}'
|
||||||
|
summary: 'Node memory usage is critically high'
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|
||||||
|
- uid: node_high_cpu_usage
|
||||||
|
title: High Node CPU Usage
|
||||||
|
condition: B
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: P76F38748CEC837F0
|
||||||
|
model:
|
||||||
|
expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
|
||||||
|
refId: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 80
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
reducer: max
|
||||||
|
refId: B
|
||||||
|
type: reduce
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 10m
|
||||||
|
annotations:
|
||||||
|
node: '{{ $labels.instance }}'
|
||||||
|
cpu_usage: '{{ printf "%.1f%%" $values.A }}'
|
||||||
|
summary: 'Node CPU usage is critically high'
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|
||||||
|
- uid: node_high_disk_usage
|
||||||
|
title: High Node Disk Usage
|
||||||
|
condition: B
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: P76F38748CEC837F0
|
||||||
|
model:
|
||||||
|
expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
|
||||||
|
refId: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 85
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
reducer: max
|
||||||
|
refId: B
|
||||||
|
type: reduce
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
node: '{{ $labels.instance }}'
|
||||||
|
filesystem: '{{ $labels.mountpoint }}'
|
||||||
|
disk_usage: '{{ printf "%.1f%%" $values.A }}'
|
||||||
|
summary: 'Node disk usage is critically high'
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
|
||||||
|
- uid: node_load_average_high
|
||||||
|
title: High Node Load Average
|
||||||
|
condition: B
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: P76F38748CEC837F0
|
||||||
|
model:
|
||||||
|
expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
|
||||||
|
refId: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 0.8
|
||||||
|
type: gt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
reducer: max
|
||||||
|
refId: B
|
||||||
|
type: reduce
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 5m
|
||||||
|
annotations:
|
||||||
|
node: '{{ $labels.instance }}'
|
||||||
|
load_average: '{{ printf "%.2f" $values.A }}'
|
||||||
|
summary: 'Node load average is high relative to CPU count'
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|
||||||
|
- uid: node_exporter_down
|
||||||
|
title: Node Exporter Down
|
||||||
|
condition: B
|
||||||
|
data:
|
||||||
|
- refId: A
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: P76F38748CEC837F0
|
||||||
|
model:
|
||||||
|
expr: 'up{job="node-exporter"}'
|
||||||
|
refId: A
|
||||||
|
intervalMs: 1000
|
||||||
|
maxDataPoints: 43200
|
||||||
|
- refId: B
|
||||||
|
relativeTimeRange:
|
||||||
|
from: 300
|
||||||
|
to: 0
|
||||||
|
datasourceUid: __expr__
|
||||||
|
model:
|
||||||
|
conditions:
|
||||||
|
- evaluator:
|
||||||
|
params:
|
||||||
|
- 1
|
||||||
|
type: lt
|
||||||
|
operator:
|
||||||
|
type: and
|
||||||
|
query:
|
||||||
|
params: []
|
||||||
|
datasource:
|
||||||
|
type: __expr__
|
||||||
|
uid: __expr__
|
||||||
|
expression: A
|
||||||
|
reducer: min
|
||||||
|
refId: B
|
||||||
|
type: reduce
|
||||||
|
noDataState: NoData
|
||||||
|
execErrState: Alerting
|
||||||
|
for: 2m
|
||||||
|
annotations:
|
||||||
|
node: '{{ $labels.instance }}'
|
||||||
|
summary: 'Node exporter is down - unable to collect metrics'
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
|
||||||
contactpoints.yaml: |
|
contactpoints.yaml: |
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
@@ -149,4 +379,4 @@ data:
|
|||||||
- alertname
|
- alertname
|
||||||
group_wait: 10s
|
group_wait: 10s
|
||||||
group_interval: 5m
|
group_interval: 5m
|
||||||
repeat_interval: 4h
|
repeat_interval: 12h
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ resources:
|
|||||||
- persistentVolume.yaml
|
- persistentVolume.yaml
|
||||||
- external-secrets.yaml
|
- external-secrets.yaml
|
||||||
- grafana-alerting-configmap.yaml
|
- grafana-alerting-configmap.yaml
|
||||||
|
- alertmanager-config.yaml
|
||||||
|
|
||||||
helmCharts:
|
helmCharts:
|
||||||
- name: kube-prometheus-stack
|
- name: kube-prometheus-stack
|
||||||
|
|||||||
@@ -26,11 +26,41 @@ alertmanager:
|
|||||||
{{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }}
|
{{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }}
|
||||||
{{ end }}
|
{{ end }}
|
||||||
|
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
ingressClassName: traefik
|
||||||
|
annotations:
|
||||||
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
|
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
|
||||||
|
hosts:
|
||||||
|
- prom.hexor.cy
|
||||||
|
paths:
|
||||||
|
- /alertmanager
|
||||||
|
tls:
|
||||||
|
- secretName: alertmanager-tls
|
||||||
|
hosts:
|
||||||
|
- prom.hexor.cy
|
||||||
alertmanagerSpec:
|
alertmanagerSpec:
|
||||||
secrets:
|
secrets:
|
||||||
- alertmanager-telegram-secret
|
- alertmanager-telegram-secret
|
||||||
|
externalUrl: https://prom.hexor.cy/alertmanager
|
||||||
|
routePrefix: /alertmanager
|
||||||
|
|
||||||
prometheus:
|
prometheus:
|
||||||
|
ingress:
|
||||||
|
enabled: true
|
||||||
|
ingressClassName: traefik
|
||||||
|
annotations:
|
||||||
|
cert-manager.io/cluster-issuer: letsencrypt
|
||||||
|
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
|
||||||
|
hosts:
|
||||||
|
- prom.hexor.cy
|
||||||
|
paths:
|
||||||
|
- /
|
||||||
|
tls:
|
||||||
|
- secretName: prometheus-tls
|
||||||
|
hosts:
|
||||||
|
- prom.hexor.cy
|
||||||
prometheusSpec:
|
prometheusSpec:
|
||||||
enableRemoteWriteReceiver: true
|
enableRemoteWriteReceiver: true
|
||||||
additionalScrapeConfigs:
|
additionalScrapeConfigs:
|
||||||
|
|||||||
@@ -189,6 +189,9 @@ EOT
|
|||||||
meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
|
meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
|
||||||
mode = "proxy"
|
mode = "proxy"
|
||||||
outpost = "kubernetes-outpost"
|
outpost = "kubernetes-outpost"
|
||||||
|
skip_path_regex = <<-EOT
|
||||||
|
/clients
|
||||||
|
EOT
|
||||||
}
|
}
|
||||||
"pasarguard" = {
|
"pasarguard" = {
|
||||||
name = "PasarGuard"
|
name = "PasarGuard"
|
||||||
|
|||||||
Reference in New Issue
Block a user