Added node alerts
All checks were successful
Update Kubernetes Services Wiki / Generate and Update K8s Wiki (push) Successful in 8s
Check with kubeconform / lint (push) Successful in 7s
Auto-update README / Generate README and Create MR (push) Successful in 7s

This commit is contained in:
Ultradesu
2026-02-09 13:00:15 +02:00
parent 137384ce55
commit 8a8cab019f

View File

@@ -111,6 +111,236 @@ data:
labels:
severity: critical
- uid: node_high_memory_usage
title: High Node Memory Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: last
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
memory_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node memory usage is critically high'
labels:
severity: warning
- uid: node_high_cpu_usage
title: High Node CPU Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: last
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 10m
annotations:
node: '{{ $labels.instance }}'
cpu_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node CPU usage is critically high'
labels:
severity: warning
- uid: node_high_disk_usage
title: High Node Disk Usage
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes)) * 100'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 85
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: last
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
filesystem: '{{ $labels.mountpoint }}'
disk_usage: '{{ printf "%.1f%%" $values.A }}'
summary: 'Node disk usage is critically high'
labels:
severity: critical
- uid: node_load_average_high
title: High Node Load Average
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.8
type: gt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: last
refId: B
type: reduce
noDataState: NoData
execErrState: Alerting
for: 5m
annotations:
node: '{{ $labels.instance }}'
load_average: '{{ printf "%.2f" $values.A }}'
summary: 'Node load average is high relative to CPU count'
labels:
severity: warning
- uid: node_exporter_down
title: Node Exporter Down
condition: B
data:
- refId: A
relativeTimeRange:
from: 300
to: 0
datasourceUid: P76F38748CEC837F0
model:
expr: 'up{job="node-exporter"}'
refId: A
intervalMs: 1000
maxDataPoints: 43200
- refId: B
relativeTimeRange:
from: 300
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 1
type: lt
operator:
type: and
query:
params: []
datasource:
type: __expr__
uid: __expr__
expression: A
reducer: last
refId: B
type: reduce
noDataState: Alerting
execErrState: Alerting
for: 2m
annotations:
node: '{{ $labels.instance }}'
summary: 'Node exporter is down - unable to collect metrics'
labels:
severity: critical
contactpoints.yaml: |
apiVersion: 1
contactPoints: