diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
new file mode 100644
index 00000000..cb04acf8
--- /dev/null
+++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
@@ -0,0 +1,91 @@
+# Grafana unified-alerting provisioning files (alert rules, contact points,
+# notification policy), packaged as a ConfigMap.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-alerting
+  namespace: prometheus
+data:
+  # Alert rule group: fires when the pasarguard-node container is CPU-throttled.
+  rules.yaml: |
+    apiVersion: 1
+    groups:
+      - orgId: 1
+        name: pasarguard_alerts
+        folder: Kubernetes
+        interval: 1m
+        rules:
+          - uid: pasarguard_cpu_throttling
+            title: VPN CPU Throttle
+            condition: A
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 600
+                  to: 0
+                datasourceUid: prometheus
+                model:
+                  expr: 'rate(container_cpu_cfs_throttled_periods_total{container="pasarguard-node"}[5m]) > 0.1'
+                  # The query is used directly as the rule condition, so it must
+                  # be an instant query; a range query returns time series, which
+                  # Grafana refuses to alert on without a reduce expression.
+                  instant: true
+                  range: false
+                  refId: A
+            # The `> 0.1` filter makes the query return nothing while the
+            # container is healthy, so an empty result must map to OK rather
+            # than raise a NoData alert.
+            noDataState: OK
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              description: 'Throttling rate: {{ printf "%.2f" $values.A.Value }}'
+              summary: 'VPN node throttling CPU on {{ $labels.node }}'
+            labels:
+              severity: warning
+
+  # Telegram contact point and its message template.
+  # NOTE(review): $TELEGRAM_* are presumably expanded from Grafana's environment
+  # at provisioning time - confirm the Grafana pod defines both variables.
+  contactpoints.yaml: |
+    apiVersion: 1
+    contactPoints:
+      - orgId: 1
+        name: telegram
+        receivers:
+          - uid: telegram_default
+            type: telegram
+            disableResolveMessage: false
+            settings:
+              bottoken: $TELEGRAM_BOT_TOKEN
+              chatid: $TELEGRAM_CHAT_ID
+              message: |
+                {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
+
+                {{ range .Alerts }}
+                📊 {{ .Labels.alertname }}
+                {{ if .Annotations.summary }}{{ .Annotations.summary }}{{ end }}
+
+                🎯 Details:
+                • Pod: {{ .Labels.pod }}
+                • Node: {{ .Labels.node }}
+                • Namespace: {{ .Labels.namespace }}
+                {{ if .Annotations.description }}• {{ .Annotations.description }}{{ end }}
+
+                🔗 View in Grafana
+                {{ end }}
+              parse_mode: HTML
+
+  # Notification policy: route alerts to the telegram receiver, grouped by
+  # Grafana folder and alert name.
+  policies.yaml: |
+    apiVersion: 1
+    policies:
+      - orgId: 1
+        receiver: telegram
+        group_by:
+          - grafana_folder
+          - alertname
+        group_wait: 10s
+        group_interval: 5m
+        repeat_interval: 4h