### node_exporter_rules.yml
```
# Alerting rules for Linux hosts scraped through node_exporter, plus generic
# target-availability checks built on the `up` series.
#
# NOTE(review): severity values are spelled inconsistently across the rules
# (Warning / Warn / Error / info / critical). They are left untouched because
# downstream routing may match on the exact strings — confirm, then normalize.
groups:
  - name: HOST
    rules:
      # Any scrape target (regardless of job) has been unreachable for 10s.
      - alert: Node实例已宕机
        expr: up == 0
        for: 10s
        labels:
          user: root
          severity: Warning
        annotations:
          summary: "Instance {{ $labels.instance }} Down"
          description: "xxx系统 {{ $labels.instance }} of job {{ $labels.job }} has been Down."

      # A target of the `federate` job has been unreachable for 10 minutes.
      - alert: MasterDown
        expr: up{job='federate'} == 0
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "Master 主机服务异常"
          description: "xxx系统{{ $labels.instance }} Master 9090 端口服务异常"

      # A `node` target is down AND its reported uptime exceeds 30 minutes.
      # NOTE(review): series scraped from a target are marked stale as soon as
      # a scrape fails, so the node_* side of this `and` is probably empty
      # whenever up == 0 and the alert may never fire — confirm, and consider
      # comparing against past samples (e.g. with `offset`).
      - alert: InstanceDown
        expr: (up{job='node'} == 0) and ((node_time_seconds-node_boot_time_seconds) > 1800)
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "监控数据获取异常"
          description: "xxx系统{{ $labels.instance }} 主机可能宕机,所在节点 Master 私网IP {{ $labels.master_private_ip }} "

      # Uptime (node clock minus boot time) is below 10 minutes. There is no
      # `for`, so the alert fires on the first evaluation that matches.
      - alert: InstanceReboot
        expr: (node_time_seconds-node_boot_time_seconds) < 600
        labels:
          severity: info
        annotations:
          summary: "重新启动"
          description: "xxx系统{{ $labels.instance }} 重新启动"
          value: "{{ $value }}"

      # Non-idle CPU percentage, averaged per instance/job, above 80%.
      - alert: CPU
        expr: round(100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,job) * 100),0.01) > 80
        for: 7m
        labels:
          severity: Warn
        annotations:
          summary: "CPU 使用率高"
          description: "xxx系统{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # Same measurement as the rule above with a 96% threshold; both rules
      # are active at once when usage exceeds 96%.
      - alert: CPU
        expr: round(100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,job) * 100),0.01) > 96
        for: 7m
        labels:
          severity: Error
        annotations:
          summary: "CPU 使用率很高"
          description: "xxx系统{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # 5-minute load average divided by the number of CPUs of the instance
      # (one `mode='system'` series per CPU is counted), above 7.
      - alert: LOAD
        expr: node_load5 / on (instance) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance)) by(instance) > 7
        for: 7m
        labels:
          severity: critical
        annotations:
          summary: "overload"
          description: "xxx系统{{ $labels.instance }} 负载/CPU核数比 {{ $value }}"
          value: "{{ $value }}"

      # Memory usage percentage above 80%. MemAvailable is used when the
      # series exists; otherwise Free + Buffers + Cached is the fallback.
      # NOTE(review): this tier and the 90% tier below are both `critical`;
      # presumably this one was meant to be lower — confirm.
      - alert: MEM
        expr: |-
          round(
            (1 - (
              (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
              or
              ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
                / node_memory_MemTotal_bytes)
            )) * 100,
            0.01
          ) > 80
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "主机内存使用率高"
          description: "xxx系统{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Same measurement as the rule above with a 90% threshold.
      - alert: MEM
        expr: |-
          round(
            (1 - (
              (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
              or
              ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
                / node_memory_MemTotal_bytes)
            )) * 100,
            0.01
          ) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "主机内存不足"
          description: "xxx系统{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Filesystem usage above 90% AND less than 10 GiB available, for
      # ext*/ocfs*/xfs filesystems other than /boot. The alert value is the
      # usage percentage (left-hand side of the `and`).
      - alert: DISK
        expr: |-
          round(
            (
              100 - (
                node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}
                / node_filesystem_size_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}
              ) * 100 > 90
              and
              node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'} / 1073741824 < 10
            ),
            0.01
          )
        for: 28m
        labels:
          severity: info
        annotations:
          summary: "存储空间不足"
          description: "xxx系统{{ $labels.instance }} {{ $labels.mountpoint }} 存储空间使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # CPU time spent in iowait, averaged per instance/job, above 80%.
      - alert: IOWAIT
        expr: round((avg by (instance,job) (irate(node_cpu_seconds_total{mode="iowait"}[3m])) * 100),0.01) > 80
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "CPU IOWAIT 过高"
          description: "xxx系统{{ $labels.instance }} CPU IOWAIT {{ $value }}%"
          value: "{{ $value }}"

      # Disk busy time, averaged over the devices of an instance/job, above
      # 40%. This fires on the same condition as the earlier
      # `100 - busy% < 60` form, but the alert value is now the busy
      # percentage that the description announces. The description no longer
      # interpolates `mountpoint`: the aggregation keeps only instance and
      # job, so that label was always empty.
      - alert: IO
        expr: round(avg by (instance,job) (irate(node_disk_io_time_seconds_total[3m])) * 100, 0.01) > 40
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "磁盘 I/O 性能低"
          description: "xxx系统{{ $labels.instance }} 磁盘 I/O 时间占比 {{ $value }}%"
          value: "{{ $value }}"

      # A process exposing process_* metrics uses more than 80% of its
      # file-descriptor limit. The value is a ratio (0-1), not a percentage.
      - alert: ProcessNearFDLimits
        expr: process_open_fds / process_max_fds > 0.8
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "A process hits 80% of the limit"
          description: "xxx系统{{ $labels.instance }} 进程使用的文件描述符数占比 {{ $value }}"
          value: "{{ $value }}"

      # More than 20000 currently established TCP connections.
      - alert: TCP_ESTAB
        expr: node_netstat_Tcp_CurrEstab > 20000
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "TCP 会话数很多"
          description: "xxx系统{{ $labels.instance }} TCP 会话数为 {{ $value }}"
          value: "{{ $value }}"
```
### windows_exporter_rules.yml
```
# Alerting rules for Windows hosts scraped through windows_exporter.
#
# NOTE(review): severity values mix `warning` and `Error`; they are left
# untouched because downstream routing may match on the exact strings —
# confirm, then normalize.
groups:
  - name: WINDOWS_EXPORTER
    rules:
      # A service's `status="ok"` series is not 1 for one minute.
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server service Status "
          description: "xxx系统{{ $labels.instance }}Windows服务状态不正常"

      # Non-idle CPU percentage per instance above 80%; `for: 0m` means the
      # alert fires on the first evaluation that matches.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系统{{ $labels.instance }}CPU使用率超过80%"
          value: "{{ $value }}"

      # Same measurement as the rule above with a 90% threshold.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
        for: 0m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系统{{ $labels.instance }}CPU使用率超过90%"
          value: "{{ $value }}"

      # Physical memory usage percentage above 80% for two minutes.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系统{{ $labels.instance }}内存使用率超过80%"
          value: "{{ $value }}"

      # Same measurement as the rule above with a 90% threshold.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系统{{ $labels.instance }}内存使用率超过90%"
          value: "{{ $value }}"

      # Logical-disk usage percentage above 80% for two minutes. The former
      # `/ 1024 / 1024` on both sides of the ratio cancelled out and was
      # dropped; the result is unchanged.
      # NOTE(review): the description does not say which disk is affected;
      # consider interpolating the disk-identifying label of the metric.
      - alert: WindowsServerDiskSpaceUsage
        expr: 100.0 - 100 * (windows_logical_disk_free_bytes / windows_logical_disk_size_bytes) > 80
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server disk Space Usage"
          description: "xxx系统{{ $labels.instance }}磁盘使用率超过80%"
          value: "{{ $value }}"
```