# File: homelab/k8s/core/prom-stack/grafana-alerting-configmap.yaml
# (153 lines, 4.8 KiB, YAML)
---
# Grafana alerting provisioning bundled as a ConfigMap.
# Each key under `data` is one provisioning file: alert rules, contact points,
# and notification policies.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: prometheus
data:
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: pasarguard_alerts
        folder: Kubernetes
        interval: 1m
        rules:
          - uid: pasarguard_cpu_throttling
            title: VPN CPU Throttle
            # Pipeline: A (PromQL) -> B (reduce to last sample) -> C (threshold > 0.1).
            # The condition must be the threshold step: a bare reduce step fires on any
            # non-zero value and never applies the 0.1 limit.
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'rate(container_cpu_cfs_throttled_periods_total{container="pasarguard-node"}[5m])'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              # B collapses every series returned by A to its most recent sample.
              - refId: B
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: last
                  refId: B
                  type: reduce
              # C turns B into 1 (throttle rate above 0.1) or 0 (otherwise).
              - refId: C
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0.1
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: B
                  refId: C
                  type: threshold
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              pod: '{{ $labels.pod }}'
              node: '{{ $labels.node }}'
              namespace: '{{ $labels.namespace }}'
              # $values only holds reduced/expression results, and each entry is an
              # object, so read the float through B's .Value.
              throttle_rate: '{{ printf "%.2f" $values.B.Value }}'
              summary: 'VPN node throttling CPU'
            labels:
              severity: warning
      - orgId: 1
        name: kubernetes_alerts
        folder: Kubernetes
        interval: 30s
        rules:
          - uid: node_not_ready
            title: Kubernetes Node Not Ready
            # Pipeline: A (PromQL, 1 = NotReady / 0 = Ready) -> B (last sample) -> C (> 0).
            condition: C
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  # `bool` makes the comparison return 1/0 for every node instead of
                  # filtering. A plain `== 0` yields value 0 for NotReady nodes (treated
                  # as Normal) and no data when all nodes are Ready (which noDataState
                  # below would turn into an alert).
                  expr: 'kube_node_status_condition{condition="Ready",status="true"} == bool 0'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              # B collapses every series returned by A to its most recent sample.
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: last
                  refId: B
                  type: reduce
              # C fires for every node whose latest sample is 1 (NotReady).
              - refId: C
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: B
                  refId: C
                  type: threshold
            # With the `bool` query above, "no data" means the metric itself is
            # missing, so alerting on it is kept.
            noDataState: Alerting
            execErrState: Alerting
            # No pending period: fire on the first failing evaluation.
            for: 0s
            annotations:
              node: '{{ $labels.node }}'
              condition: '{{ $labels.condition }}'
              summary: 'Kubernetes node is not ready'
            labels:
              severity: critical
  contactpoints.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: telegram
        receivers:
          - uid: telegram_default
            type: telegram
            disableResolveMessage: false
            settings:
              # NOTE(review): expected to be expanded from the environment when the
              # file is provisioned - confirm the Grafana container sets this variable.
              bottoken: $TELEGRAM_BOT_TOKEN
              # Quoted so the all-digit chat ID stays a string.
              chatid: "124317807"
              message: |
                {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
                {{ range .Alerts }}
                📊 <b>{{ .Labels.alertname }}</b>
                {{ .Annotations.summary }}
                {{ if .Annotations.node }}🖥 <b>Node:</b> <code>{{ .Annotations.node }}</code>{{ end }}
                {{ if .Annotations.pod }}📦 <b>Pod:</b> <code>{{ .Annotations.pod }}</code>{{ end }}
                {{ if .Annotations.namespace }}📁 <b>Namespace:</b> <code>{{ .Annotations.namespace }}</code>{{ end }}
                {{ if .Annotations.throttle_rate }}⚠️ <b>Throttling rate:</b> {{ .Annotations.throttle_rate }}{{ end }}
                🔗 <a href="{{ .GeneratorURL }}">View in Grafana</a>
                {{ end }}
              parse_mode: HTML
  policies.yaml: |
    apiVersion: 1
    policies:
      # Root policy: every alert goes to the telegram contact point, grouped by
      # folder and alert name.
      - orgId: 1
        receiver: telegram
        group_by:
          - grafana_folder
          - alertname
        group_wait: 10s
        group_interval: 5m
        repeat_interval: 4h