From 8a8cab019f3e8ca8dbaee96715f57b0531557808 Mon Sep 17 00:00:00 2001
From: Ultradesu
Date: Mon, 9 Feb 2026 13:00:15 +0200
Subject: [PATCH] Added node alerts

---
 .../grafana-alerting-configmap.yaml | 235 ++++++++++++++++++
 1 file changed, 235 insertions(+)

diff --git a/k8s/core/prom-stack/grafana-alerting-configmap.yaml b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
index 2dfdddd..9c7ad41 100644
--- a/k8s/core/prom-stack/grafana-alerting-configmap.yaml
+++ b/k8s/core/prom-stack/grafana-alerting-configmap.yaml
@@ -110,6 +110,241 @@ data:
               summary: 'Kubernetes node is not ready'
             labels:
               severity: critical
+
+          - uid: node_high_memory_usage
+            title: High Node Memory Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  # Memory in use as a percentage: 1 - MemAvailable/MemTotal
+                  expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: last
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              memory_usage: '{{ printf "%.1f%%" $values.B }}'
+              summary: 'Node memory usage is above 80%'
+            labels:
+              severity: warning
+
+          - uid: node_high_cpu_usage
+            title: High Node CPU Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  # CPU busy percentage: 100 minus the per-instance average idle rate
+                  expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 80
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: last
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 10m
+            annotations:
+              node: '{{ $labels.instance }}'
+              cpu_usage: '{{ printf "%.1f%%" $values.B }}'
+              summary: 'Node CPU usage is above 80%'
+            labels:
+              severity: warning
+
+          - uid: node_high_disk_usage
+            title: High Node Disk Usage
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  # Filesystem usage percentage; tmpfs and lxcfs pseudo-filesystems are excluded
+                  expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes)) * 100'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 85
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: last
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              filesystem: '{{ $labels.mountpoint }}'
+              disk_usage: '{{ printf "%.1f%%" $values.B }}'
+              summary: 'Node disk usage is critically high'
+            labels:
+              severity: critical
+
+          - uid: node_load_average_high
+            title: High Node Load Average
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  # 5-minute load average normalised by CPU count (one idle series per core)
+                  expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 0.8
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: last
+                  refId: B
+                  type: reduce
+            noDataState: NoData
+            execErrState: Alerting
+            for: 5m
+            annotations:
+              node: '{{ $labels.instance }}'
+              load_average: '{{ printf "%.2f" $values.B }}'
+              summary: 'Node load average is high relative to CPU count'
+            labels:
+              severity: warning
+
+          - uid: node_exporter_down
+            title: Node Exporter Down
+            condition: B
+            data:
+              - refId: A
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: P76F38748CEC837F0
+                model:
+                  # up is 1 while the node-exporter target is reachable, 0 when a scrape fails
+                  expr: 'up{job="node-exporter"}'
+                  refId: A
+                  intervalMs: 1000
+                  maxDataPoints: 43200
+              - refId: B
+                relativeTimeRange:
+                  from: 300
+                  to: 0
+                datasourceUid: __expr__
+                model:
+                  conditions:
+                    - evaluator:
+                        params:
+                          - 1
+                        type: lt
+                      operator:
+                        type: and
+                      query:
+                        params: []
+                  datasource:
+                    type: __expr__
+                    uid: __expr__
+                  expression: A
+                  reducer: last
+                  refId: B
+                  type: reduce
+            noDataState: Alerting
+            execErrState: Alerting
+            for: 2m
+            annotations:
+              node: '{{ $labels.instance }}'
+              summary: 'Node exporter is down - unable to collect metrics'
+            labels:
+              severity: critical
 
   contactpoints.yaml: |
     apiVersion: 1