# Grafana alerting provisioning: alert rules, a Telegram contact point, and the notification policy.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: prometheus
data:
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: pasarguard_alerts
        folder: Kubernetes
        interval: 1m
        rules:
          - uid: pasarguard_cpu_throttling
            title: VPN CPU Throttle
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: P76F38748CEC837F0  # UID of the Prometheus data source in Grafana
                model:
                  expr: 'rate(container_cpu_cfs_throttled_periods_total{container="pasarguard-node"}[5m])'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0.1
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              pod: '{{ $labels.pod }}'
              node: '{{ $labels.node }}'
              namespace: '{{ $labels.namespace }}'
              # Use the reduce expression (B); values of range queries are not exposed in $values.
              throttle_rate: '{{ printf "%.2f" $values.B.Value }}'
              summary: 'VPN node throttling CPU'
            labels:
              severity: warning
      - orgId: 1
        name: kubernetes_alerts
        folder: Kubernetes
        interval: 2m
        rules:
          - uid: node_not_ready
            title: Kubernetes Node Not Ready
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'kube_node_status_condition{condition="Ready",status="false"}'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 10m
            annotations:
              node: '{{ $labels.node }}'
              condition: '{{ $labels.condition }}'
              summary: 'Kubernetes node is not ready'
            labels:
              severity: critical
          - uid: node_high_memory_usage
            title: High Node Memory Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 80
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node memory usage is critically high'
            labels:
              severity: warning
          - uid: node_high_cpu_usage
            title: High Node CPU Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 80
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 10m
            annotations:
              node: '{{ $labels.instance }}'
              cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node CPU usage is critically high'
            labels:
              severity: warning
          - uid: node_high_disk_usage
            title: High Node Disk Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 85
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              filesystem: '{{ $labels.mountpoint }}'
              disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node disk usage is critically high'
            labels:
              severity: critical
          - uid: node_load_average_high
            title: High Node Load Average
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0.8
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              load_average: '{{ printf "%.2f" $values.B.Value }}'
              summary: 'Node load average is high relative to CPU count'
            labels:
              severity: warning
          - uid: node_exporter_down
            title: Node Exporter Down
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'up{job="node-exporter"}'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 1
                        type: lt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 2m
            annotations:
              node: '{{ $labels.instance }}'
              summary: 'Node exporter is down - unable to collect metrics'
            labels:
              severity: critical
  contactpoints.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: telegram
        receivers:
          - uid: telegram_default
            type: telegram
            disableResolveMessage: false
            settings:
              bottoken: $TELEGRAM_BOT_TOKEN  # interpolated from the Grafana container environment
              chatid: "124317807"
              message: |
                {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
                {{ range .Alerts }}
                📊 {{ .Labels.alertname }}
                {{ .Annotations.summary }}
                {{ if .Annotations.node }}🖥 Node: {{ .Annotations.node }}{{ end }}
                {{ if .Annotations.pod }}📦 Pod: {{ .Annotations.pod }}{{ end }}
                {{ if .Annotations.namespace }}📁 Namespace: {{ .Annotations.namespace }}{{ end }}
                {{ if .Annotations.throttle_rate }}⚠️ Throttling rate: {{ .Annotations.throttle_rate }}{{ end }}
                🔗 View in Grafana
                {{ end }}
              parse_mode: HTML
  policies.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: telegram
        group_by:
          - grafana_folder
          - alertname
        group_wait: 10s
        group_interval: 5m
        repeat_interval: 12h
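# Usage note (a sketch, not part of this manifest): Grafana reads these files from its
# alerting provisioning directory, so the ConfigMap has to be mounted into the Grafana pod
# and TELEGRAM_BOT_TOKEN has to be present in that container's environment. Assuming a
# Grafana Deployment in the same namespace and a hypothetical Secret "grafana-telegram",
# the relevant pod-spec fragment could look like:
#
#   volumes:
#     - name: grafana-alerting
#       configMap:
#         name: grafana-alerting
#   containers:
#     - name: grafana
#       volumeMounts:
#         - name: grafana-alerting
#           mountPath: /etc/grafana/provisioning/alerting
#       env:
#         - name: TELEGRAM_BOT_TOKEN
#           valueFrom:
#             secretKeyRef:
#               name: grafana-telegram   # hypothetical Secret holding the bot token
#               key: bot-token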