Added node alerts
This commit is contained in:
@@ -111,6 +111,236 @@ data:
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
# Alert: a node's memory utilisation stays above 80% for 5 minutes.
# Pipeline: A (Prometheus query) -> B (reduce to last sample) -> C (threshold).
- uid: node_high_memory_usage
  title: High Node Memory Usage
  # The condition must reference the threshold step. A reduce step used as the
  # condition fires on any non-zero value, so the 80% limit was never applied.
  condition: C
  data:
    # A: percentage of memory in use, computed from MemAvailable / MemTotal.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    # B: collapse each series returned by A to its most recent sample.
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        refId: B
        type: reduce
    # C: compare the reduced value from B against the 80% limit.
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 80
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    # printf's %f verb needs the numeric .Value field; B holds the reduced
    # sample that the threshold is evaluated against.
    memory_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node memory usage is critically high'
  labels:
    severity: warning
|
||||
|
||||
# Alert: a node's average non-idle CPU percentage stays above 80% for
# 10 minutes.
# Pipeline: A (Prometheus query) -> B (reduce to last sample) -> C (threshold).
- uid: node_high_cpu_usage
  title: High Node CPU Usage
  # The condition must reference the threshold step. A reduce step used as the
  # condition fires on any non-zero value, so the 80% limit was never applied.
  condition: C
  data:
    # A: 100 minus the per-instance average idle rate (as a percentage).
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    # B: collapse each series returned by A to its most recent sample.
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        refId: B
        type: reduce
    # C: compare the reduced value from B against the 80% limit.
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 80
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 10m
  annotations:
    node: '{{ $labels.instance }}'
    # printf's %f verb needs the numeric .Value field; B holds the reduced
    # sample that the threshold is evaluated against.
    cpu_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node CPU usage is critically high'
  labels:
    severity: warning
|
||||
|
||||
# Alert: a filesystem's used-space percentage stays above 85% for 5 minutes.
# Pipeline: A (Prometheus query) -> B (reduce to last sample) -> C (threshold).
- uid: node_high_disk_usage
  title: High Node Disk Usage
  # The condition must reference the threshold step. A reduce step used as the
  # condition fires on any non-zero value, so the 85% limit was never applied.
  condition: C
  data:
    # A: used-space percentage per filesystem series; tmpfs and fuse.lxcfs
    # filesystem types are excluded by the label matcher.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: '(1 - (node_filesystem_avail_bytes{fstype!~"tmpfs|fuse.lxcfs"} / node_filesystem_size_bytes)) * 100'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    # B: collapse each series returned by A to its most recent sample.
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        refId: B
        type: reduce
    # C: compare the reduced value from B against the 85% limit.
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 85
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    filesystem: '{{ $labels.mountpoint }}'
    # printf's %f verb needs the numeric .Value field; B holds the reduced
    # sample that the threshold is evaluated against.
    disk_usage: '{{ printf "%.1f%%" $values.B.Value }}'
    summary: 'Node disk usage is critically high'
  labels:
    severity: critical
|
||||
|
||||
# Alert: a node's 5-minute load average divided by its CPU series count stays
# above 0.8 for 5 minutes.
# Pipeline: A (Prometheus query) -> B (reduce to last sample) -> C (threshold).
- uid: node_load_average_high
  title: High Node Load Average
  # The condition must reference the threshold step. A reduce step used as the
  # condition fires on any non-zero value, so the 0.8 limit was never applied.
  condition: C
  data:
    # A: node_load5 divided by the per-instance count of idle-mode CPU series.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: 'node_load5 / on(instance) group_left count by(instance)(node_cpu_seconds_total{mode="idle"})'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    # B: collapse each series returned by A to its most recent sample.
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        refId: B
        type: reduce
    # C: compare the reduced value from B against the 0.8 limit.
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 0.8
              type: gt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  noDataState: NoData
  execErrState: Alerting
  for: 5m
  annotations:
    node: '{{ $labels.instance }}'
    # printf's %f verb needs the numeric .Value field; B holds the reduced
    # sample that the threshold is evaluated against.
    load_average: '{{ printf "%.2f" $values.B.Value }}'
    summary: 'Node load average is high relative to CPU count'
  labels:
    severity: warning
|
||||
|
||||
# Alert: the `up` series for the node-exporter job stays below 1 for
# 2 minutes, or the query returns no data at all.
# Pipeline: A (Prometheus query) -> B (reduce to last sample) -> C (threshold).
- uid: node_exporter_down
  title: Node Exporter Down
  # The condition must reference the threshold step. With the reduce step as
  # the condition, a non-zero value (up == 1) fired the alert and up == 0 was
  # treated as normal - the opposite of the intent.
  condition: C
  data:
    # A: scrape health of the node-exporter job targets.
    - refId: A
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: P76F38748CEC837F0
      model:
        expr: 'up{job="node-exporter"}'
        refId: A
        intervalMs: 1000
        maxDataPoints: 43200
    # B: collapse each series returned by A to its most recent sample.
    - refId: B
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
        refId: B
        type: reduce
    # C: fire when the reduced value from B is below 1.
    - refId: C
      relativeTimeRange:
        from: 300
        to: 0
      datasourceUid: __expr__
      model:
        conditions:
          - evaluator:
              params:
                - 1
              type: lt
            operator:
              type: and
            query:
              params:
                - C
            reducer:
              params: []
              type: last
            type: query
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        refId: C
        type: threshold
  # Missing data is itself treated as a firing condition for this rule.
  noDataState: Alerting
  execErrState: Alerting
  for: 2m
  annotations:
    node: '{{ $labels.instance }}'
    summary: 'Node exporter is down - unable to collect metrics'
  labels:
    severity: critical
|
||||
|
||||
contactpoints.yaml: |
|
||||
apiVersion: 1
|
||||
contactPoints:
|
||||
|
||||
Reference in New Issue
Block a user