Compare commits: auto-updat...main
19 commits in this range:

- c28566ce21
- 1b2b4da98d
- 014284c11d
- fc00513db3
- 1451a5fb37
- d19ae33cd1
- 8a8cab019f
- 137384ce55
- 1aee4d5cd7
- 9d6fa51fc7
- fc689d5e22
- a2f4f989e7
- cacc5ef02b
- f05a1515e6
- dbb9722840
- e7e066587f
- cb83a3fa38
- 4b3e1a10d4
- caf024aaa2
n8n deployment (main):

```diff
@@ -19,6 +19,35 @@ spec:
         component: main
     spec:
       serviceAccountName: n8n
+      initContainers:
+      - name: install-tools
+        image: alpine:3.22
+        command:
+        - /bin/sh
+        - -c
+        - |
+          set -e
+          if [ -x /tools/kubectl ]; then
+            echo "kubectl already exists, skipping download"
+            /tools/kubectl version --client
+            exit 0
+          fi
+          echo "Downloading kubectl..."
+          ARCH=$(uname -m)
+          case $ARCH in
+            x86_64) ARCH="amd64" ;;
+            aarch64) ARCH="arm64" ;;
+          esac
+          wget -O /tools/kubectl "https://dl.k8s.io/release/$(wget -qO- https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl"
+          chmod +x /tools/kubectl
+          /tools/kubectl version --client
+        volumeMounts:
+        - name: tools
+          mountPath: /tools
+        securityContext:
+          runAsUser: 1000
+          runAsGroup: 1000
+          runAsNonRoot: true
       containers:
       - name: n8n
         image: docker.n8n.io/n8nio/n8n:latest
@@ -26,6 +55,8 @@ spec:
         - containerPort: 5678
           name: http
         env:
+        - name: PATH
+          value: "/opt/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
         - name: HOME
           value: "/home/node"
         - name: N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS
@@ -83,13 +114,15 @@ spec:
         volumeMounts:
         - name: n8n-data
           mountPath: /home/node/.n8n
+        - name: tools
+          mountPath: /opt/tools
         resources:
           requests:
-            cpu: 100m
-            memory: 128Mi
-          limits:
-            cpu: 512m
+            cpu: 2000m
+            memory: 512Mi
+          limits:
+            cpu: 4000m
+            memory: 2048Gi
         livenessProbe:
           httpGet:
             path: /healthz
@@ -110,6 +143,9 @@ spec:
       - name: n8n-data
         persistentVolumeClaim:
           claimName: n8n-data
+      - name: tools
+        persistentVolumeClaim:
+          claimName: n8n-tools
       securityContext:
         runAsUser: 1000
         runAsGroup: 1000
```
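The init container drops a kubectl binary onto the shared tools volume (mounted at /tools there, at /opt/tools in the n8n container), and the new PATH entry makes it callable from workflows. A minimal verification sketch, assuming the deployment is named n8n and runs in an n8n namespace (neither name is shown in this diff):

```sh
# Confirm the n8n container resolves kubectl from the shared tools volume.
# "n8n" namespace and deployment name are assumptions, not taken from the diff.
kubectl -n n8n exec deploy/n8n -- sh -c 'command -v kubectl && kubectl version --client'
```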
n8n worker deployment:

```diff
@@ -7,7 +7,7 @@ metadata:
     app: n8n
     component: worker
 spec:
-  replicas: 1
+  replicas: 2
   selector:
     matchLabels:
       app: n8n
@@ -24,8 +24,12 @@ spec:
         image: docker.n8n.io/n8nio/n8n:latest
         command: ["n8n", "worker"]
         env:
+        - name: PATH
+          value: "/opt/tools:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+        - name: HOME
+          value: "/home/node"
        - name: NODES_EXCLUDE
          value: "[]"
        - name: N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS
          value: "true"
        - name: N8N_RUNNERS_ENABLED
@@ -75,13 +79,15 @@ spec:
         volumeMounts:
         - name: n8n-data
           mountPath: /home/node/.n8n
+        - name: tools
+          mountPath: /opt/tools
         resources:
           requests:
-            cpu: 100m
-            memory: 256Mi
+            cpu: 2000m
+            memory: 512Mi
           limits:
-            cpu: 1000m
-            memory: 1Gi
+            cpu: 4000m
+            memory: 2048Gi
         livenessProbe:
           exec:
             command:
@@ -96,6 +102,9 @@ spec:
       - name: n8n-data
         persistentVolumeClaim:
           claimName: n8n-data
+      - name: tools
+        persistentVolumeClaim:
+          claimName: n8n-tools
       securityContext:
         runAsUser: 1000
         runAsGroup: 1000
```
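With replicas bumped to 2, the main pod and both workers mount the same tools volume, which is why the n8n-tools claim further down is ReadWriteMany. A quick check sketch, with the namespace again assumed to be n8n:

```sh
# Both worker replicas should be Running and spreadable across nodes,
# since the shared volumes no longer pin them to a single node.
kubectl -n n8n get pods -l app=n8n,component=worker -o wide
```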
n8n ClusterRole:

```diff
@@ -9,19 +9,27 @@ kind: ClusterRole
 metadata:
   name: n8n-clusterrole
 rules:
-- apiGroups:
-  - ""
-  resources:
-  - pods
-  - jobs
-  - cronjobs
-  - deployments
-  - statefulsets
-  verbs:
-  - get
-  - list
-  - watch
-  - create
+# Core API group ("")
+- apiGroups: [""]
+  resources: ["*"]
+  verbs: ["get", "list", "watch"]
+
+# Common built-in API groups
+- apiGroups: ["apps", "batch", "autoscaling", "extensions", "policy"]
+  resources: ["*"]
+  verbs: ["get", "list", "watch"]
+
+- apiGroups: ["networking.k8s.io", "rbac.authorization.k8s.io", "apiextensions.k8s.io"]
+  resources: ["*"]
+  verbs: ["get", "list", "watch"]
+
+- apiGroups: ["coordination.k8s.io", "discovery.k8s.io", "events.k8s.io"]
+  resources: ["*"]
+  verbs: ["get", "list", "watch"]
+
+- apiGroups: ["storage.k8s.io", "admissionregistration.k8s.io", "authentication.k8s.io", "authorization.k8s.io"]
+  resources: ["*"]
+  verbs: ["get", "list", "watch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
```
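The old rule granted create and listed batch and apps resources under the core API group, where they do not actually live; the replacement is read-only across explicitly named API groups. A spot-check sketch, assuming the service account (serviceAccountName: n8n in the deployment above) lives in an n8n namespace:

```sh
# Read access should now span namespaces; write access should be gone.
kubectl auth can-i list pods --all-namespaces --as=system:serviceaccount:n8n:n8n   # expect: yes
kubectl auth can-i create pods --as=system:serviceaccount:n8n:n8n                  # expect: no
```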
n8n PersistentVolumeClaims:

```diff
@@ -10,3 +10,15 @@ spec:
   resources:
     requests:
       storage: 10Gi
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: n8n-tools
+spec:
+  accessModes:
+  - ReadWriteMany
+  storageClassName: longhorn
+  resources:
+    requests:
+      storage: 20Gi
```
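The new claim backs the shared /opt/tools mount; ReadWriteMany is what lets the main pod, the init container, and both workers attach it simultaneously. A minimal check, namespace assumed to be n8n:

```sh
# The claim should report STATUS Bound against the longhorn storage class.
kubectl -n n8n get pvc n8n-tools
```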
Longhorn Helm values:

```diff
@@ -1,2 +1,4 @@
 longhornUI:
   replicas: 1
+persistence:
+  reclaimPolicy: "Retain"
```
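Retain means newly provisioned Longhorn volumes keep their backing data when the PVC is deleted, so a dropped claim is recoverable rather than destructive. A sketch to confirm the value landed on the storage class after the Helm values roll out:

```sh
# Should print "Retain" once the chart re-renders the storage class.
kubectl get storageclass longhorn -o jsonpath='{.reclaimPolicy}'
```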
k8s/core/prom-stack/alertmanager-config.yaml (new file, 46 lines):

```diff
@@ -0,0 +1,46 @@
+apiVersion: monitoring.coreos.com/v1alpha1
+kind: AlertmanagerConfig
+metadata:
+  name: telegram-notifications
+  namespace: prometheus
+  labels:
+    app: kube-prometheus-stack-alertmanager
+    release: prometheus
+spec:
+  route:
+    groupBy: ['alertname', 'cluster', 'service']
+    groupWait: 10s
+    groupInterval: 5m
+    repeatInterval: 12h
+    receiver: telegram
+    routes:
+    - matchers:
+      - name: alertname
+        value: Watchdog
+        matchType: "="
+      receiver: 'null'
+  receivers:
+  - name: telegram
+    telegramConfigs:
+    - botToken:
+        name: alertmanager-telegram-secret
+        key: TELEGRAM_BOT_TOKEN
+      chatID: 124317807
+      parseMode: HTML
+      sendResolved: true
+      disableNotifications: false
+      message: |
+        {{ if eq .Status "firing" }}🔥 FIRING{{ else }}✅ RESOLVED{{ end }}
+
+        {{ range .Alerts }}
+        📊 <b>{{ .Labels.alertname }}</b>
+        {{ .Annotations.summary }}
+
+        {{ if .Annotations.node }}🖥 <b>Node:</b> <code>{{ .Annotations.node }}</code>{{ end }}
+        {{ if .Annotations.pod }}📦 <b>Pod:</b> <code>{{ .Annotations.pod }}</code>{{ end }}
+        {{ if .Annotations.namespace }}📁 <b>Namespace:</b> <code>{{ .Annotations.namespace }}</code>{{ end }}
+        {{ if .Annotations.throttle_rate }}⚠️ <b>Throttling rate:</b> {{ .Annotations.throttle_rate }}{{ end }}
+
+        🔗 <a href="{{ .GeneratorURL }}">View in Grafana</a>
+        {{ end }}
+  - name: 'null'
```
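Everything routes to the Telegram receiver except the Watchdog heartbeat, which is swallowed by the 'null' receiver. A verification sketch using only the names that appear in the manifest:

```sh
# The operator should list the accepted AlertmanagerConfig, and the secret
# referenced by botToken must exist in the same namespace.
kubectl -n prometheus get alertmanagerconfigs telegram-notifications
kubectl -n prometheus get secret alertmanager-telegram-secret
```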
Grafana alerting provisioning ConfigMap:

```diff
@@ -45,7 +45,7 @@ data:
               type: __expr__
               uid: __expr__
             expression: A
-            reducer: last
+            reducer: min
             refId: B
             type: reduce
         noDataState: NoData
@@ -63,7 +63,7 @@ data:
     - orgId: 1
       name: kubernetes_alerts
       folder: Kubernetes
-      interval: 30s
+      interval: 2m
       rules:
       - uid: node_not_ready
        title: Kubernetes Node Not Ready
@@ -71,17 +71,17 @@ data:
         data:
         - refId: A
           relativeTimeRange:
-            from: 300
+            from: 600
            to: 0
          datasourceUid: P76F38748CEC837F0
          model:
-            expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
+            expr: 'kube_node_status_condition{condition="Ready",status="false"}'
            refId: A
            intervalMs: 1000
            maxDataPoints: 43200
        - refId: B
          relativeTimeRange:
-            from: 300
+            from: 600
            to: 0
          datasourceUid: __expr__
          model:
@@ -98,12 +98,12 @@ data:
              datasource:
                type: __expr__
                uid: __expr__
            expression: A
-            reducer: last
+            reducer: min
            refId: B
            type: reduce
-        noDataState: Alerting
+        noDataState: NoData
        execErrState: Alerting
-        for: 0s
+        for: 10m
        annotations:
          node: '{{ $labels.node }}'
          condition: '{{ $labels.condition }}'
@@ -111,6 +111,236 @@ data:
        labels:
          severity: critical

+      - uid: node_high_memory_usage
+        title: High Node Memory Usage
+        condition: B
+        data:
+        - refId: A
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: P76F38748CEC837F0
+          model:
+            expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
+            refId: A
+            intervalMs: 1000
+            maxDataPoints: 43200
+        - refId: B
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: __expr__
+          model:
+            conditions:
+            - evaluator:
+                params:
+                - 80
+                type: gt
+              operator:
+                type: and
+              query:
+                params: []
+            datasource:
+              type: __expr__
+              uid: __expr__
+            expression: A
+            reducer: max
+            refId: B
+            type: reduce
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          node: '{{ $labels.instance }}'
+          memory_usage: '{{ printf "%.1f%%" $values.A }}'
+          summary: 'Node memory usage is critically high'
+        labels:
+          severity: warning
+
+      - uid: node_high_cpu_usage
+        title: High Node CPU Usage
+        condition: B
+        data:
+        - refId: A
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: P76F38748CEC837F0
+          model:
+            expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
+            refId: A
+            intervalMs: 1000
+            maxDataPoints: 43200
+        - refId: B
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: __expr__
+          model:
+            conditions:
+            - evaluator:
+                params:
+                - 80
+                type: gt
+              operator:
+                type: and
+              query:
+                params: []
+            datasource:
+              type: __expr__
+              uid: __expr__
+            expression: A
+            reducer: max
+            refId: B
+            type: reduce
+        noDataState: NoData
+        execErrState: Alerting
+        for: 10m
+        annotations:
+          node: '{{ $labels.instance }}'
+          cpu_usage: '{{ printf "%.1f%%" $values.A }}'
+          summary: 'Node CPU usage is critically high'
+        labels:
+          severity: warning
+
+      - uid: node_high_disk_usage
+        title: High Node Disk Usage
+        condition: B
+        data:
+        - refId: A
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: P76F38748CEC837F0
+          model:
+            expr: '(1 - (node_filesystem_avail_bytes{fstype=~"ext[234]|xfs|zfs|btrfs"} / node_filesystem_size_bytes)) * 100'
+            refId: A
+            intervalMs: 1000
+            maxDataPoints: 43200
+        - refId: B
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: __expr__
+          model:
+            conditions:
+            - evaluator:
+                params:
+                - 85
+                type: gt
+              operator:
+                type: and
+              query:
+                params: []
+            datasource:
+              type: __expr__
+              uid: __expr__
+            expression: A
+            reducer: max
+            refId: B
+            type: reduce
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          node: '{{ $labels.instance }}'
+          filesystem: '{{ $labels.mountpoint }}'
+          disk_usage: '{{ printf "%.1f%%" $values.A }}'
+          summary: 'Node disk usage is critically high'
+        labels:
+          severity: critical
+
+      - uid: node_load_average_high
+        title: High Node Load Average
+        condition: B
+        data:
+        - refId: A
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: P76F38748CEC837F0
+          model:
+            expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
+            refId: A
+            intervalMs: 1000
+            maxDataPoints: 43200
+        - refId: B
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: __expr__
+          model:
+            conditions:
+            - evaluator:
+                params:
+                - 0.8
+                type: gt
+              operator:
+                type: and
+              query:
+                params: []
+            datasource:
+              type: __expr__
+              uid: __expr__
+            expression: A
+            reducer: max
+            refId: B
+            type: reduce
+        noDataState: NoData
+        execErrState: Alerting
+        for: 5m
+        annotations:
+          node: '{{ $labels.instance }}'
+          load_average: '{{ printf "%.2f" $values.A }}'
+          summary: 'Node load average is high relative to CPU count'
+        labels:
+          severity: warning
+
+      - uid: node_exporter_down
+        title: Node Exporter Down
+        condition: B
+        data:
+        - refId: A
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: P76F38748CEC837F0
+          model:
+            expr: 'up{job="node-exporter"}'
+            refId: A
+            intervalMs: 1000
+            maxDataPoints: 43200
+        - refId: B
+          relativeTimeRange:
+            from: 300
+            to: 0
+          datasourceUid: __expr__
+          model:
+            conditions:
+            - evaluator:
+                params:
+                - 1
+                type: lt
+              operator:
+                type: and
+              query:
+                params: []
+            datasource:
+              type: __expr__
+              uid: __expr__
+            expression: A
+            reducer: min
+            refId: B
+            type: reduce
+        noDataState: NoData
+        execErrState: Alerting
+        for: 2m
+        annotations:
+          node: '{{ $labels.instance }}'
+          summary: 'Node exporter is down - unable to collect metrics'
+        labels:
+          severity: critical
+
   contactpoints.yaml: |
     apiVersion: 1
     contactPoints:
@@ -149,4 +379,4 @@ data:
       - alertname
       group_wait: 10s
       group_interval: 5m
-      repeat_interval: 4h
+      repeat_interval: 12h
```
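Each new rule queries a node-exporter expression (refId A), reduces it (refId B), and alerts on a threshold. A sketch for running one of the provisioned expressions by hand; the service name prometheus-operated is the operator's default and is an assumption here:

```sh
# Port-forward Prometheus locally, then evaluate the memory-usage expression
# exactly as the node_high_memory_usage rule does.
kubectl -n prometheus port-forward svc/prometheus-operated 9090:9090 &
sleep 2
curl -s http://localhost:9090/api/v1/query --data-urlencode \
  'query=(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
```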
prom-stack kustomization.yaml:

```diff
@@ -5,6 +5,7 @@ resources:
 - persistentVolume.yaml
 - external-secrets.yaml
 - grafana-alerting-configmap.yaml
+- alertmanager-config.yaml

 helmCharts:
 - name: kube-prometheus-stack
```
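Because this kustomization also inflates a Helm chart, rendering it locally needs the standalone kustomize binary with Helm support. A sketch:

```sh
# Render the overlay and confirm the new AlertmanagerConfig resource is emitted.
kustomize build --enable-helm k8s/core/prom-stack | grep -c AlertmanagerConfig
```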
kube-prometheus-stack Helm values:

```diff
@@ -26,11 +26,41 @@ alertmanager:
         {{ if .Annotations.description }}<b>Description:</b> {{ .Annotations.description }}{{ end }}
         {{ end }}

+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /alertmanager
+    tls:
+      - secretName: alertmanager-tls
+        hosts:
+          - prom.hexor.cy
   alertmanagerSpec:
+    secrets:
+      - alertmanager-telegram-secret
+    externalUrl: https://prom.hexor.cy/alertmanager
+    routePrefix: /alertmanager

 prometheus:
+  ingress:
+    enabled: true
+    ingressClassName: traefik
+    annotations:
+      cert-manager.io/cluster-issuer: letsencrypt
+      traefik.ingress.kubernetes.io/router.middlewares: kube-system-https-redirect@kubernetescrd
+    hosts:
+      - prom.hexor.cy
+    paths:
+      - /
+    tls:
+      - secretName: prometheus-tls
+        hosts:
+          - prom.hexor.cy
   prometheusSpec:
     enableRemoteWriteReceiver: true
     additionalScrapeConfigs:
```
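routePrefix and externalUrl both carry /alertmanager, so the ingress path and the server's own prefix line up, with Prometheus served at the host root. A probe sketch, assuming DNS and the cert-manager certificates are already in place:

```sh
# Both servers expose a /-/healthy endpoint; Alertmanager answers under its prefix.
curl -sk https://prom.hexor.cy/-/healthy
curl -sk https://prom.hexor.cy/alertmanager/-/healthy
```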