diff --git a/k8s/core/prom-stack/alertmanager-config.yaml b/k8s/core/prom-stack/alertmanager-config.yaml
new file mode 100644
index 0000000..0b0221d
--- /dev/null
+++ b/k8s/core/prom-stack/alertmanager-config.yaml
@@ -0,0 +1,46 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: telegram-notifications
+  namespace: prometheus
+  labels:
+    app: kube-prometheus-stack-alertmanager
+    release: prometheus
+spec:
+  route:
+    groupBy: ['alertname', 'cluster', 'service']
+    groupWait: 10s
+    groupInterval: 5m
+    repeatInterval: 12h
+    receiver: telegram
+    routes:
+      - matchers:
+          - name: alertname
+            value: Watchdog
+            matchType: "="
+        receiver: 'null'
+  receivers:
+    - name: telegram
+      telegramConfigs:
+        - botToken:
+            name: alertmanager-telegram-secret
+            key: TELEGRAM_BOT_TOKEN
+          chatID: 124317807
+          parseMode: HTML
+          sendResolved: true
+          disableNotifications: false
+          message: |
+            {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
+
+            {{ range .Alerts }}
+            📊 {{ .Labels.alertname }}
+            {{ .Annotations.summary }}
+
+            {{ if .Annotations.node }}🖥 Node: {{ .Annotations.node }}{{ end }}
+            {{ if .Annotations.pod }}📦 Pod: {{ .Annotations.pod }}{{ end }}
+            {{ if .Annotations.namespace }}📁 Namespace: {{ .Annotations.namespace }}{{ end }}
+            {{ if .Annotations.throttle_rate }}⚠️ Throttling rate: {{ .Annotations.throttle_rate }}{{ end }}
+
+            🔗 View in Grafana
+            {{ end }}
+    - name: 'null'
diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
index 27a412c..fc49d45 100644
--- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml
+++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
@@ -45,7 +45,7 @@ data:
                     type: __expr__
                     uid: __expr__
                   expression: A
-                  reducer: last
+                  reducer: min
                   refId: B
                   type: reduce
             noDataState: NoData
@@ -63,7 +63,7 @@ data:
       - orgId: 1
         name: kubernetes_alerts
         folder: Kubernetes
-        interval: 30s
+        interval: 2m
         rules:
           - uid: node_not_ready
             title: Kubernetes Node Not Ready
@@ -71,17 +71,17 @@ data:
             data:
               - refId: A
                 relativeTimeRange:
-                  from: 300
+                  from: 600
                   to: 0
                 datasourceUid: P76F38748CEC837F0
                 model:
-                  expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
+                  expr: 'kube_node_status_condition{condition="Ready",status="false"}'
                   refId: A
                   intervalMs: 1000
                   maxDataPoints: 43200
              - refId: B
                 relativeTimeRange:
-                  from: 300
+                  from: 600
                   to: 0
                 datasourceUid: __expr__
                 model:
@@ -98,18 +98,248 @@ data:
                     type: __expr__
                     uid: __expr__
                   expression: A
-                  reducer: last
+                  reducer: min
                   refId: B
                   type: reduce
-            noDataState: Alerting
+            noDataState: NoData
             execErrState: Alerting
-            for: 0s
+            for: 10m
             annotations:
               node: '{{ $labels.node }}'
               condition: '{{ $labels.condition }}'
               summary: 'Kubernetes node is not ready'
             labels:
               severity: critical
+
+          - uid: node_high_memory_usage
+            title: High Node Memory Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node memory usage is high (above 80%)'
+            labels:
+              severity: warning
+
+          - uid: node_high_cpu_usage
+            title: High Node CPU Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 10m
+            annotations:
+              node: '{{ $labels.instance }}'
+              cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node CPU usage is high (above 80%)'
+            labels:
+              severity: warning
+
+          - uid: node_high_disk_usage
+            title: High Node Disk Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 85
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              filesystem: '{{ $labels.mountpoint }}'
+              disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
+              summary: 'Node disk usage is critically high'
+            labels:
+              severity: critical
+
+          - uid: node_load_average_high
+            title: High Node Load Average
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 0.8
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: max
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              load_average: '{{ printf "%.2f" $values.B.Value }}'
+              summary: 'Node load average is high relative to CPU count'
+            labels:
+              severity: warning
+
+          - uid: node_exporter_down
+            title: Node Exporter Down
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  expr: 'up{job="node-exporter"}'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 1
+                        type: lt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: min
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 2m
+            annotations:
+              node: '{{ $labels.instance }}'
+              summary: 'Node exporter is down - unable to collect metrics'
+            labels:
+              severity: critical
 
   contactpoints.yaml: |
     apiVersion: 1
@@ -149,4 +149,4 @@ data:
           - alertname
         group_wait: 10s
         group_interval: 5m
-        repeat_interval: 4h
+        repeat_interval: 12h
diff --git a/k8s/core/prom-stack/kustomization.yaml b/k8s/core/prom-stack/kustomization.yaml
index e08a833..225c5e4 100644
--- a/k8s/core/prom-stack/kustomization.yaml
+++ b/k8s/core/prom-stack/kustomization.yaml
@@ -5,6 +5,7 @@ resources:
   - persistentVolume.yaml
   - external-secrets.yaml
   - grafana-alerting-configmap.yaml
+  - alertmanager-config.yaml
 
 helmCharts:
   - name: kube-prometheus-stack
diff --git a/k8s/core/prom-stack/prom-values.yaml b/k8s/core/prom-stack/prom-values.yaml
index b96bd52..fff45e2 100644
--- a/k8s/core/prom-stack/prom-values.yaml
+++ b/k8s/core/prom-stack/prom-values.yaml
@@ -26,11 +26,41 @@ alertmanager:
         {{ if .Annotations.description }}Description: {{ .Annotations.description }}{{ end }}
         {{ end }}
 
+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /alertmanager
+    tls:
+      - secretName: alertmanager-tls
+        hosts:
+          - prom.hexor.cy
   alertmanagerSpec:
     secrets:
       - alertmanager-telegram-secret
+    externalUrl: https://prom.hexor.cy/alertmanager
+    routePrefix: /alertmanager
 
 prometheus:
+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /
+    tls:
+      - secretName: prometheus-tls
+        hosts:
+          - prom.hexor.cy
   prometheusSpec:
     enableRemoteWriteReceiver: true
     additionalScrapeConfigs:
diff --git a/terraform/authentik/proxy-apps.tfvars b/terraform/authentik/proxy-apps.tfvars
index f68705a..956b55f 100644
--- a/terraform/authentik/proxy-apps.tfvars
+++ b/terraform/authentik/proxy-apps.tfvars
@@ -189,6 +189,9 @@ EOT
     meta_icon = "https://img.icons8.com/color/48/minecraft-grass-cube.png"
     mode      = "proxy"
     outpost   = "kubernetes-outpost"
+    skip_path_regex = <<-EOT
+/clients
+EOT
   }
   "pasarguard" = {
     name = "PasarGuard"
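
A quick way to sanity-check the new alert expressions before the Grafana rules go live is to evaluate them through the Prometheus HTTP API. Below is a minimal sketch, assuming the prom.hexor.cy ingress added in prom-values.yaml is reachable without auth from where it runs (add the appropriate headers or a port-forward if a proxy fronts it); the expressions and thresholds are copied from the rules above.

```python
#!/usr/bin/env python3
"""Sanity-check the node alert expressions against the Prometheus HTTP API."""
import json
import urllib.parse
import urllib.request

# Prometheus ingress added in prom-values.yaml; adjust if it is protected.
PROM_URL = "https://prom.hexor.cy"

# Expressions copied from the Grafana alert rules, paired with their thresholds.
CHECKS = {
    "memory > 80%": ("(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100", 80),
    "cpu > 80%": ('100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)', 80),
    "disk > 85%": ('(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100', 85),
}

for name, (expr, threshold) in CHECKS.items():
    # Instant query; each returned series is one potential alert instance.
    url = f"{PROM_URL}/api/v1/query?{urllib.parse.urlencode({'query': expr})}"
    with urllib.request.urlopen(url) as resp:
        result = json.load(resp)["data"]["result"]
    for series in result:
        instance = series["metric"].get("instance", "?")
        value = float(series["value"][1])
        flag = "FIRING" if value > threshold else "ok"
        print(f"{name:12s} {instance:30s} {value:6.1f} {flag}")
```

Any series printed as FIRING should line up with what the corresponding Grafana rule reports once its `for:` window has elapsed.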
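The Telegram credentials wired into the AlertmanagerConfig can be smoke-tested the same way, directly against the Bot API, without waiting for a real alert. This is a manual one-off check, not part of the deployment: it assumes TELEGRAM_BOT_TOKEN is exported locally with the same value that lands in alertmanager-telegram-secret, and it mirrors the chatID and parseMode: HTML from the receiver.

```python
#!/usr/bin/env python3
"""One-off smoke test for the Telegram receiver credentials."""
import json
import os
import urllib.parse
import urllib.request

BOT_TOKEN = os.environ["TELEGRAM_BOT_TOKEN"]  # never hardcode the secret
CHAT_ID = 124317807  # chatID from alertmanager-config.yaml

payload = urllib.parse.urlencode({
    "chat_id": CHAT_ID,
    "parse_mode": "HTML",  # mirrors parseMode: HTML in the receiver
    "text": "🔥 <b>FIRING</b>\n\n📊 TestAlert\nManual smoke test of the Alertmanager Telegram receiver",
}).encode()

# sendMessage is the same Bot API method Alertmanager's Telegram
# integration calls when it delivers notifications.
req = urllib.request.Request(
    f"https://api.telegram.org/bot{BOT_TOKEN}/sendMessage",
    data=payload,
    headers={"Content-Type": "application/x-www-form-urlencoded"},
)
with urllib.request.urlopen(req) as resp:
    print(json.dumps(json.load(resp), indent=2))
```

A response with "ok": true confirms the token/chat pair works before the receiver ever fires.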