# Grafana alerting provisioning: alert rules, a Telegram contact point, and the notification policy.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-alerting
  namespace: prometheus
data:
  rules.yaml: |
    apiVersion: 1
    groups:
      - orgId: 1
        name: pasarguard_alerts
        folder: Kubernetes
        interval: 1m
        rules:
          - uid: pasarguard_cpu_throttling
            title: VPN CPU Throttle
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: P76F38748CEC837F0  # UID of the Prometheus data source in Grafana
                model:
                  expr: 'rate(container_cpu_cfs_throttled_periods_total{container="pasarguard-node"}[5m])'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0.1
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              pod: '{{ $labels.pod }}'
              node: '{{ $labels.node }}'
              namespace: '{{ $labels.namespace }}'
              # Use the reduce expression (B); values of range queries are not exposed in $values.
              throttle_rate: '{{ printf "%.2f" $values.B.Value }}'
              summary: 'VPN node throttling CPU'
            labels:
              severity: warning
      - orgId: 1
        name: kubernetes_alerts
        folder: Kubernetes
        interval: 2m
        rules:
          - uid: node_not_ready
            title: Kubernetes Node Not Ready
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'kube_node_status_condition{condition="Ready",status="false"}'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 600
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 10m
            annotations:
              node: '{{ $labels.node }}'
              condition: '{{ $labels.condition }}'
              summary: 'Kubernetes node is not ready'
            labels:
              severity: critical
          - uid: node_high_memory_usage
            title: High Node Memory Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 80
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node memory usage is critically high'
            labels:
              severity: warning
          - uid: node_high_cpu_usage
            title: High Node CPU Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 80
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 10m
            annotations:
              node: '{{ $labels.instance }}'
              cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node CPU usage is critically high'
            labels:
              severity: warning
          - uid: node_high_disk_usage
            title: High Node Disk Usage
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 85
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              filesystem: '{{ $labels.mountpoint }}'
              disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
              summary: 'Node disk usage is critically high'
            labels:
              severity: critical
          - uid: node_load_average_high
            title: High Node Load Average
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 0.8
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: max
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 5m
            annotations:
              node: '{{ $labels.instance }}'
              load_average: '{{ printf "%.2f" $values.B.Value }}'
              summary: 'Node load average is high relative to CPU count'
            labels:
              severity: warning
          - uid: node_exporter_down
            title: Node Exporter Down
            condition: B
            data:
              - refId: A
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: P76F38748CEC837F0
                model:
                  expr: 'up{job="node-exporter"}'
                  refId: A
                  intervalMs: 1000
                  maxDataPoints: 43200
              - refId: B
                relativeTimeRange:
                  from: 300
                  to: 0
                datasourceUid: __expr__
                model:
                  conditions:
                    - evaluator:
                        params:
                          - 1
                        type: lt
                      operator:
                        type: and
                      query:
                        params: []
                  datasource:
                    type: __expr__
                    uid: __expr__
                  expression: A
                  reducer: min
                  refId: B
                  type: reduce
            noDataState: NoData
            execErrState: Alerting
            for: 2m
            annotations:
              node: '{{ $labels.instance }}'
              summary: 'Node exporter is down - unable to collect metrics'
            labels:
              severity: critical
  contactpoints.yaml: |
    apiVersion: 1
    contactPoints:
      - orgId: 1
        name: telegram
        receivers:
          - uid: telegram_default
            type: telegram
            disableResolveMessage: false
            settings:
              bottoken: $TELEGRAM_BOT_TOKEN  # interpolated from the Grafana container environment
              chatid: "124317807"
              message: |
                {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
                {{ range .Alerts }}
                📊 {{ .Labels.alertname }}
                {{ .Annotations.summary }}
                {{ if .Annotations.node }}🖥 Node: {{ .Annotations.node }}{{ end }}
                {{ if .Annotations.pod }}📦 Pod: {{ .Annotations.pod }}{{ end }}
                {{ if .Annotations.namespace }}📁 Namespace: {{ .Annotations.namespace }}{{ end }}
                {{ if .Annotations.throttle_rate }}⚠️ Throttling rate: {{ .Annotations.throttle_rate }}{{ end }}
                🔗 View in Grafana
                {{ end }}
              parse_mode: HTML
  policies.yaml: |
    apiVersion: 1
    policies:
      - orgId: 1
        receiver: telegram
        group_by:
          - grafana_folder
          - alertname
        group_wait: 10s
        group_interval: 5m
        repeat_interval: 12h
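# Usage note (a sketch, not part of this manifest): Grafana reads these files from its
# alerting provisioning directory, so the ConfigMap has to be mounted into the Grafana pod
# and TELEGRAM_BOT_TOKEN has to be present in that container's environment. Assuming a
# Grafana Deployment in the same namespace and a hypothetical Secret "grafana-telegram",
# the relevant pod-spec fragment could look like:
#
#   volumes:
#     - name: grafana-alerting
#       configMap:
#         name: grafana-alerting
#   containers:
#     - name: grafana
#       volumeMounts:
#         - name: grafana-alerting
#           mountPath: /etc/grafana/provisioning/alerting
#       env:
#         - name: TELEGRAM_BOT_TOKEN
#           valueFrom:
#             secretKeyRef:
#               name: grafana-telegram   # hypothetical Secret holding the bot token
#               key: bot-token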