Merge branch 'main' of ssh://gt.hexor.cy:30022/ab/homelab
This commit is contained in:
46
k8s/core/prom-stack/alertmanager-config.yaml
Normal file
46
k8s/core/prom-stack/alertmanager-config.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
# AlertmanagerConfig "telegram-notifications" (namespace "prometheus").
# Default route: deliver to the "telegram" receiver.
# Child route: alerts whose alertname equals "Watchdog" go to the "null"
# receiver, which declares no integrations.
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: telegram-notifications
  namespace: prometheus
  labels:
    app: kube-prometheus-stack-alertmanager
    release: prometheus
spec:
  route:
    receiver: telegram
    groupBy:
      - alertname
      - cluster
      - service
    groupWait: 10s
    groupInterval: 5m
    repeatInterval: 12h
    routes:
      - receiver: "null"  # quoted so YAML reads a string, not a null value
        matchers:
          - name: alertname
            matchType: "="
            value: Watchdog
  receivers:
    - name: telegram
      telegramConfigs:
        - chatID: 124317807
          # Bot token is read from a Secret key rather than stored inline.
          botToken:
            name: alertmanager-telegram-secret
            key: TELEGRAM_BOT_TOKEN
          parseMode: HTML
          sendResolved: true
          disableNotifications: false
          # One status header for the group, then one section per alert.
          # Optional annotation lines are only rendered when the annotation is set.
          message: |
            {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}

            {{ range .Alerts }}
            📊 <b>{{ .Labels.alertname }}</b>
            {{ .Annotations.summary }}

            {{ if .Annotations.node }}🖥 <b>Node:</b> <code>{{ .Annotations.node }}</code>{{ end }}
            {{ if .Annotations.pod }}📦 <b>Pod:</b> <code>{{ .Annotations.pod }}</code>{{ end }}
            {{ if .Annotations.namespace }}📁 <b>Namespace:</b> <code>{{ .Annotations.namespace }}</code>{{ end }}
            {{ if .Annotations.throttle_rate }}⚠️ <b>Throttling rate:</b> {{ .Annotations.throttle_rate }}{{ end }}

            🔗 <a href="{{ .GeneratorURL }}">View in Grafana</a>
            {{ end }}
    - name: "null"
|
||||
@@ -45,7 +45,7 @@ data:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
reducer: last
|
||||
reducer: min
|
||||
refId: B
|
||||
type: reduce
|
||||
noDataState: NoData
|
||||
@@ -63,7 +63,7 @@ data:
|
||||
- orgId: 1
|
||||
name: kubernetes_alerts
|
||||
folder: Kubernetes
|
||||
interval: 30s
|
||||
interval: 2m
|
||||
rules:
|
||||
- uid: node_not_ready
|
||||
title: Kubernetes Node Not Ready
|
||||
@@ -71,17 +71,17 @@ data:
|
||||
data:
|
||||
- refId: A
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: P76F38748CEC837F0
|
||||
model:
|
||||
expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
|
||||
expr: 'kube_node_status_condition{condition="Ready",status="false"}'
|
||||
refId: A
|
||||
intervalMs: 1000
|
||||
maxDataPoints: 43200
|
||||
- refId: B
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
from: 600
|
||||
to: 0
|
||||
datasourceUid: __expr__
|
||||
model:
|
||||
@@ -98,18 +98,248 @@ data:
|
||||
type: __expr__
|
||||
uid: __expr__
|
||||
expression: A
|
||||
reducer: last
|
||||
reducer: min
|
||||
refId: B
|
||||
type: reduce
|
||||
noDataState: Alerting
|
||||
noDataState: NoData
|
||||
execErrState: Alerting
|
||||
for: 0s
|
||||
for: 10m
|
||||
annotations:
|
||||
node: '{{ $labels.node }}'
|
||||
condition: '{{ $labels.condition }}'
|
||||
summary: 'Kubernetes node is not ready'
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
# Alerts when node memory usage, computed as
# (1 - MemAvailable / MemTotal) * 100, is above 80 for 5 minutes.
# Pipeline: A = datasource query, B = reduce A to one number per series (max),
# C = threshold on B. The rule condition must be C: a `reduce` expression does
# not apply an evaluator, so using B as the condition would alert on any
# non-zero value.
- uid: node_high_memory_usage
  title: High Node Memory Usage
  condition: C
  data:
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: max
        refId: B
        type: reduce
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 80
              type: gt
            operator:
              type: and
            query:
              params: []
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    # Format the reduced float (B); A is a range query, not a single value.
    memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node memory usage is critically high'
  labels:
    severity: warning
|
||||
|
||||
# Alerts when per-instance CPU usage, computed as 100 minus the average idle
# rate over 5m (in percent), is above 80 for 10 minutes.
# Pipeline: A = datasource query, B = reduce A to one number per series (max),
# C = threshold on B. The rule condition must be C: a `reduce` expression does
# not apply an evaluator, so using B as the condition would alert on any
# non-zero value.
- uid: node_high_cpu_usage
  title: High Node CPU Usage
  condition: C
  data:
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: max
        refId: B
        type: reduce
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 80
              type: gt
            operator:
              type: and
            query:
              params: []
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 10m
  annotations:
    node: '{{ $labels.instance }}'
    # Format the reduced float (B); A is a range query, not a single value.
    cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node CPU usage is critically high'
  labels:
    severity: warning
|
||||
|
||||
# Alerts when filesystem usage, computed as (1 - avail / size) * 100 for
# ext2/3/4, xfs, zfs and btrfs filesystems, is above 85 for 5 minutes.
# Pipeline: A = datasource query, B = reduce A to one number per series (max),
# C = threshold on B. The rule condition must be C: a `reduce` expression does
# not apply an evaluator, so using B as the condition would alert on any
# non-zero value.
- uid: node_high_disk_usage
  title: High Node Disk Usage
  condition: C
  data:
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: max
        refId: B
        type: reduce
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 85
              type: gt
            operator:
              type: and
            query:
              params: []
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    filesystem: '{{ $labels.mountpoint }}'
    # Format the reduced float (B); A is a range query, not a single value.
    disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node disk usage is critically high'
  labels:
    severity: critical
|
||||
|
||||
# Alerts when node_load5 divided by the per-instance count of idle-mode CPU
# series is above 0.8 for 5 minutes.
# Pipeline: A = datasource query, B = reduce A to one number per series (max),
# C = threshold on B. The rule condition must be C: a `reduce` expression does
# not apply an evaluator, so using B as the condition would alert on any
# non-zero value.
- uid: node_load_average_high
  title: High Node Load Average
  condition: C
  data:
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: max
        refId: B
        type: reduce
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 0.8
              type: gt
            operator:
              type: and
            query:
              params: []
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    # Format the reduced float (B); A is a range query, not a single value.
    load_average: '{{ printf "%.2f" $values.B.Value }}'
    summary: 'Node load average is high relative to CPU count'
  labels:
    severity: warning
|
||||
|
||||
# Alerts when the minimum of up{job="node-exporter"} over the query window is
# below 1 for 2 minutes.
# Pipeline: A = datasource query, B = reduce A to one number per series (min),
# C = threshold on B (lt 1). The rule condition must be C: a `reduce`
# expression does not apply an evaluator, so using B as the condition would
# alert whenever `up` is non-zero, i.e. while the target is being scraped
# successfully.
- uid: node_exporter_down
  title: Node Exporter Down
  condition: C
  data:
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: 'up{job="node-exporter"}'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: min
        refId: B
        type: reduce
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 1
              type: lt
            operator:
              type: and
            query:
              params: []
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 2m
  annotations:
    node: '{{ $labels.instance }}'
    summary: 'Node exporter is down - unable to collect metrics'
  labels:
    severity: critical
|
||||
|
||||
contactpoints.yaml: |
|
||||
apiVersion: 1
|
||||
@@ -149,4 +379,4 @@ data:
|
||||
- alertname
|
||||
group_wait: 10s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
repeat_interval: 12h
|
||||
|
||||
@@ -5,6 +5,7 @@ resources:
|
||||
- persistentVolume.yaml
|
||||
- external-secrets.yaml
|
||||
- grafana-alerting-configmap.yaml
|
||||
- alertmanager-config.yaml
|
||||
|
||||
helmCharts:
|
||||
- name: kube-prometheus-stack
|
||||
|
||||
@@ -26,11 +26,41 @@ alertmanager:
|
||||
{{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }}
|
||||
{{ end }}
|
||||
|
||||
ingress:
|
||||
enabled: true
|
||||
ingressClassName: traefik
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
|
||||
hosts:
|
||||
- prom.hexor.cy
|
||||
paths:
|
||||
- /alertmanager
|
||||
tls:
|
||||
- secretName: alertmanager-tls
|
||||
hosts:
|
||||
- prom.hexor.cy
|
||||
alertmanagerSpec:
|
||||
secrets:
|
||||
- alertmanager-telegram-secret
|
||||
externalUrl: https://prom.hexor.cy/alertmanager
|
||||
routePrefix: /alertmanager
|
||||
|
||||
prometheus:
|
||||
ingress:
|
||||
enabled: true
|
||||
ingressClassName: traefik
|
||||
annotations:
|
||||
cert-manager.io/cluster-issuer: letsencrypt
|
||||
traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
|
||||
hosts:
|
||||
- prom.hexor.cy
|
||||
paths:
|
||||
- /
|
||||
tls:
|
||||
- secretName: prometheus-tls
|
||||
hosts:
|
||||
- prom.hexor.cy
|
||||
prometheusSpec:
|
||||
enableRemoteWriteReceiver: true
|
||||
additionalScrapeConfigs:
|
||||
|
||||
@@ -189,6 +189,9 @@ EOT
|
||||
meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
|
||||
mode = "proxy"
|
||||
outpost = "kubernetes-outpost"
|
||||
skip_path_regex = <<-EOT
|
||||
/clients
|
||||
EOT
|
||||
}
|
||||
"pasarguard" = {
|
||||
name = "PasarGuard"
|
||||
|
||||
Reference in New Issue
Block a user