# 1. Server restart detection (监控服务器是否重启)
# Warns when a Linux host reports an uptime below three minutes,
# i.e. the machine was (re)booted very recently.
- alert: CentosServiceRestart
  # time() minus the boot timestamp gives the current uptime in seconds.
  expr: time() - node_boot_time_seconds < 180
  # No hold period: the expression can only be true during the first 180 s
  # after boot, so the previous `for: 2m` left at most a ~60 s window in
  # which the alert could ever move from pending to firing.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Instance is restart"
    description: "Instance is restarted, uptime <3min"
# Warns when a Windows host reports an uptime below three minutes,
# i.e. the machine was (re)booted very recently.
- alert: WindowsServiceRestart
  # NOTE(review): assumes windows_system_system_up_time is the boot time as
  # a Unix timestamp, so the subtraction yields uptime in seconds - confirm
  # against the deployed windows_exporter version.
  expr: time() - windows_system_system_up_time < 180
  # No hold period: the expression can only be true during the first 180 s
  # after boot, so the previous `for: 2m` left at most a ~60 s window in
  # which the alert could ever move from pending to firing.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Instance is restart"
    description: "Instance is restarted, uptime <3min"
# 2. Memory usage too high (内存使用过高)
# Critical alert when a Linux host's memory usage stays above 98% for 2 minutes.
- alert: InstanceMemUsageHigh
  # Usage % = 100 - available/total * 100.
  # The previous expression lacked parentheses around the subtraction, so
  # `/` bound first and the result was roughly `100 - MemTotal_bytes * 100`,
  # a large negative number that could never exceed the threshold.
  expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 98
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: "Memory usage high"
    description: "Memory usage above 98%.(current usage: {{ $value }}%)"
# Critical alert when a Windows host's memory usage stays above 98% for 3 minutes.
- alert: WinInstanceMemUsageHigh
  # Usage % = 100 - free/total physical memory * 100.
  expr: 100-(windows_os_physical_memory_free_bytes/windows_cs_physical_memory_bytes)*100 > 98
  for: 3m
  labels:
    severity: critical
  annotations:
    summary: 'Instance memory usage high'
    description: 'Instance memory usage above 98%.(current usage: {{ $value }}%)'
# 3. CPU usage too high (CPU使用过高)
# Warns when a Linux host's CPU usage stays above 90% for 3 minutes.
- alert: CPUUsageHigh
  # Usage % = 100 - average idle fraction * 100, per (instance, region).
  # The mode="idle" selector is required: without it the average runs over
  # every CPU mode and the value no longer tracks how busy the host is.
  expr: 100 - (avg by (instance,region) (irate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 90
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: "CPU usage high"
    description: "CPU usage above 90%.(current usage: {{ $value }})"
# Warns when a Windows host's CPU usage stays above 90% for 3 minutes.
- alert: WinCpuUsage
  # Usage % = 100 - average idle fraction * 100, per (instance, region).
  expr: 100 - (avg by (instance,region) (irate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
  for: 3m
  labels:
    severity: warning
  annotations:
    summary: 'Instance CPU usage high'
    description: 'Instance CPU Usage is more than 90%.(current usage: {{ $value }}%)'
# 4. Disk usage too high (磁盘使用率过高)
# Critical alert when an ext4/xfs filesystem stays above 95% used for 1 minute.
- alert: DiskUsageHigh
  # Usage % = 100 - available/size * 100; only ext4 and xfs filesystems match.
  expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"} )*100 > 95
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Disk usage high'
    description: 'Disk {{ $labels.mountpoint }} usage above 95%.(current usage: {{ $value }})'
# Critical alert when a Windows logical disk stays above 95% used for 1 minute.
- alert: WinDiskUsageHigh
  # Usage % = 100 - free/size * 100 for each logical disk.
  expr: 100-(windows_logical_disk_free_bytes/windows_logical_disk_size_bytes)*100 > 95
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: 'Instance disk usage high'
    description: 'Instance disk {{ $labels.volume }} usage above 95%.(current usage: {{ $value }}%)'
# 5. Network throughput (网络吞吐量)
# Warns when inbound traffic on a Linux interface stays above 30 MB/s for 5 minutes.
- alert: HostUnusualNetworkThroughputIn
  # Received bytes/s per (instance, device, region), divided by 1024 twice
  # to express the rate in MB/s before comparing with the threshold.
  expr: sum by (instance,device,region) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 30
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual network throughput in'
    description: 'Host network interfaces are receiving too much data (> 30 MB/s).(current speed:{{ $value }}MB/s)'
# Warns when inbound traffic on a Windows NIC stays above 30 MB/s for 5 minutes.
- alert: WinHostUnusualNetworkThroughputIn
  # Received bytes/s per (instance, nic, region), divided by 1024 twice to
  # express the rate in MB/s. Only NICs whose name contains "VirtIO" match.
  expr: sum by (instance,nic,region) (irate(windows_net_bytes_received_total{nic=~".*VirtIO.*"}[2m])) / 1024 / 1024>30
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual network throughput in'
    description: 'Host network interfaces are probably receiving too much data (> 30 MB/s).(current speed: {{ $value }})'
# Warns when outbound traffic on a Linux interface stays above 30 MB/s for 5 minutes.
- alert: HostUnusualNetworkThroughputOut
  # Transmitted bytes/s per (instance, device, region), divided by 1024
  # twice to express the rate in MB/s before comparing with the threshold.
  expr: sum by (instance,device,region) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 30
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual network throughput out'
    description: 'Host network interfaces are sending too much data (> 30 MB/s).(current speed:{{ $value }}MB/s)'
# 6. TCP connections (TCP连接)
# Warns when the established TCP connection gauge stays above 2000 for 1 minute.
- alert: TCPEstablishedNum
  expr: node_netstat_Tcp_CurrEstab > 2000
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: 'TCP established connect too many'
    description: 'TCP establised connect count excess 2000.(current count: {{ $value }})'
# 7. Server network transmit errors (服务器网络传输错误)
# Warns when an interface keeps accumulating transmit errors:
# more than 2 within a 5-minute window, sustained for 5 minutes.
- alert: HostNetworkTransmitErrors
  expr: increase(node_network_transmit_errs_total[5m]) > 2
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host Network Transmit Errors'
    # Disabled, more verbose alternative for the description below:
    #description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%v" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
    description: 'Interface {{ $labels.device }} has transmit errors in the last five minutes.(current error packages:{{ $value }})'
# 8. Disk read/write latency (磁盘读写延迟)
# Warns when the average disk read latency stays above 100 ms for 5 minutes.
- alert: HostUnusualDiskReadLatency
  # Seconds spent reading per completed read over the last minute,
  # multiplied by 1000 to convert to milliseconds.
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) * 1000 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual disk read latency'
    description: 'Disk read latency is growing (read operations > 100ms).(current latency: {{ $value }}ms)'
# Warns when the average disk write latency stays above 100 ms for 5 minutes.
- alert: HostUnusualDiskWriteLatency
  # Seconds spent writing per completed write over the last minute,
  # multiplied by 1000 to convert to milliseconds.
  expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) * 1000 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual disk write latency'
    description: 'Disk write latency is growing (write operations > 100ms).(current latency: {{ $value }}ms)'
# 9. Disk I/O too high (磁盘IO过高)
# Warns when a disk spends more than 60% of the time on I/O for 2 minutes.
- alert: DiskIOTimePerSec
  # Per-second increase of the I/O time counter, scaled by 100 to a percentage.
  expr: irate(node_disk_io_time_seconds_total[1m])*100 > 60
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host disk io time high'
    description: 'Disk {{ $labels.device }} io time occupy above 60% (current rate: {{ $value }})'