合规国际互联网加速 OSASE为企业客户提供高速稳定SD-WAN国际加速解决方案。 广告
### node_exporter_rules.yml

```yaml
# Prometheus alerting rules for Linux hosts scraped through node_exporter.
#
# NOTE(review): severity values in this file are spelled inconsistently
# (Warning / Warn / Error / critical / info). They are left as found because
# Alertmanager routes may match on the exact strings - confirm before
# normalising them.
groups:
  - name: HOST
    rules:
      # Any scrape target (all jobs) reporting up == 0 for 10 seconds.
      - alert: Node实例已宕机
        expr: up == 0
        for: 10s
        labels:
          user: root
          severity: Warning
        annotations:
          summary: "Instance {{ $labels.instance }} Down"
          description: "xxx系统 {{ $labels.instance }} of job {{ $labels.job }} has been Down."

      # The target of job 'federate' has been unreachable for 10 minutes.
      - alert: MasterDown
        expr: up{job='federate'} == 0
        for: 10m
        labels:
          severity: info
        annotations:
          summary: "Master 主机服务异常"
          description: "xxx系统{{ $labels.instance }} Master 9090 端口服务异常"

      # A 'node' target is down while its reported uptime exceeds 30 minutes.
      # NOTE(review): when a scrape fails, the target's node_time_seconds and
      # node_boot_time_seconds series normally go stale, so the right-hand
      # side of `and` may be empty exactly when up == 0 - confirm that this
      # alert can actually fire in this setup.
      - alert: InstanceDown
        expr: (up{job='node'} == 0) and ((node_time_seconds-node_boot_time_seconds) > 1800)
        for: 5m
        labels:
          severity: info
        annotations:
          summary: "监控数据获取异常"
          description: "xxx系统{{ $labels.instance }} 主机可能宕机,所在节点 Master 私网IP {{ $labels.master_private_ip }} "

      # Uptime below 10 minutes, i.e. the host booted recently. No `for`
      # clause, so the alert fires on the first evaluation that matches.
      - alert: InstanceReboot
        expr: (node_time_seconds-node_boot_time_seconds) < 600
        labels:
          severity: info
        annotations:
          summary: "重新启动"
          description: "xxx系统{{ $labels.instance }} 重新启动"
          value: "{{ $value }}"

      # CPU usage (100 - idle%) averaged per instance/job; lower tier, > 80%.
      - alert: CPU
        expr: round(100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,job) * 100),0.01) > 80
        for: 7m
        labels:
          severity: Warn
        annotations:
          summary: "CPU 使用率高"
          description: "xxx系统{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # CPU usage, upper tier, > 96%. Distinguished from the lower tier by
      # its severity label.
      - alert: CPU
        expr: round(100 - (avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance,job) * 100),0.01) > 96
        for: 7m
        labels:
          severity: Error
        annotations:
          summary: "CPU 使用率很高"
          description: "xxx系统{{ $labels.instance }} CPU 使用率 {{ $value }}%"
          value: "{{ $value }}"

      # 5-minute load average divided by the number of CPUs of the instance
      # (one 'system' mode series is counted per cpu).
      - alert: LOAD
        expr: node_load5 / on (instance) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance)) by(instance) > 7
        for: 7m
        labels:
          severity: critical
        annotations:
          summary: "overload"
          description: "xxx系统{{ $labels.instance }} 负载/CPU核数比 {{ $value }}"
          value: "{{ $value }}"

      # Memory usage, lower tier, > 80%. Uses MemAvailable when the series
      # exists and falls back to MemFree + Buffers + Cached otherwise.
      # The severity differs from the > 90% rule below on purpose: with the
      # same alert name AND the same labels both rules would yield an
      # identical label set above 90%, and Alertmanager would treat them as
      # a single alert whose annotations overwrite each other.
      - alert: MEM
        expr: >-
          round((1 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          or ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
          / node_memory_MemTotal_bytes))) * 100,0.01) > 80
        for: 10m
        labels:
          severity: Warn
        annotations:
          summary: "主机内存使用率高"
          description: "xxx系统{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Memory usage, upper tier, > 90% (same computation as above).
      - alert: MEM
        expr: >-
          round((1 - ((node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
          or ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)
          / node_memory_MemTotal_bytes))) * 100,0.01) > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "主机内存不足"
          description: "xxx系统{{ $labels.instance }} MEM 使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Filesystem (ext*/ocfs*/xfs, excluding /boot) is more than 90% used
      # AND has less than 10 GiB available. $value is the used percentage.
      - alert: DISK
        expr: >-
          round((100-(node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}
          /node_filesystem_size_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'})*100 > 90
          and node_filesystem_avail_bytes{mountpoint!='/boot',fstype=~'ext.+|ocfs.+|xfs'}/1073741824 < 10),0.01)
        for: 28m
        labels:
          severity: info
        annotations:
          summary: "存储空间不足"
          description: "xxx系统{{ $labels.instance }} {{ $labels.mountpoint }} 存储空间使用占比 {{ $value }}%"
          value: "{{ $value }}"

      # Share of CPU time spent in iowait, averaged per instance/job, > 80%.
      - alert: IOWAIT
        expr: round((avg by (instance,job) (irate(node_cpu_seconds_total{mode="iowait"}[3m])) * 100),0.01) > 80
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "CPU IOWAIT 过高"
          description: "xxx系统{{ $labels.instance }} CPU IOWAIT {{ $value }}%"
          value: "{{ $value }}"

      # Average disk busy time per instance/job above 40%.
      # The rule used to be written as `100 - busy% < 60`, which fires under
      # the same condition but made $value the idle share, while the
      # description reports it as the I/O time share. The expression now
      # yields busy% directly so the reported number matches its text.
      # The mountpoint label was removed from the description because the
      # aggregation keeps only instance and job, so it always rendered empty.
      - alert: IO
        expr: round(avg(irate(node_disk_io_time_seconds_total[3m])) by (instance,job) * 100,0.01) > 40
        for: 7m
        labels:
          severity: info
        annotations:
          summary: "磁盘 I/O 性能低"
          description: "xxx系统{{ $labels.instance }} 磁盘 I/O 时间占比 {{ $value }}%"
          value: "{{ $value }}"

      # A process uses more than 80% of its file-descriptor limit.
      - alert: ProcessNearFDLimits
        expr: process_open_fds / process_max_fds > 0.8
        for: 3m
        labels:
          severity: critical
        annotations:
          summary: "A process hits 80% of the limit"
          description: "xxx系统{{ $labels.instance }} 进程使用的文件描述符数占比 {{ $value }}"
          value: "{{ $value }}"

      # More than 20000 established TCP connections.
      - alert: TCP_ESTAB
        expr: node_netstat_Tcp_CurrEstab > 20000
        for: 3m
        labels:
          severity: info
        annotations:
          summary: "TCP 会话数很多"
          description: "xxx系统{{ $labels.instance }} TCP 会话数为 {{ $value }}"
          value: "{{ $value }}"
```

### windows_exporter_rules.yml

```yaml
# Prometheus alerting rules for Windows hosts scraped through windows_exporter.
#
# NOTE(review): severity is spelled `warning` and `Error` here, which differs
# from the spellings used in the node_exporter rules - confirm against the
# Alertmanager routes before normalising.
groups:
  - name: WINDOWS_EXPORTER
    rules:
      # A service's status="ok" series is not 1 for one minute.
      - alert: WindowsServerServiceStatus
        expr: windows_service_status{status="ok"} != 1
        for: 1m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server service Status "
          description: "xxx系统{{ $labels.instance }}Windows服务状态不正常"

      # CPU usage (100 - idle%) per instance, lower tier, > 80%.
      # `for: 0m` makes the alert fire on the first matching evaluation.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 80
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系统{{ $labels.instance }}CPU使用率超过80%"
          value: "{{ $value }}"

      # CPU usage, upper tier, > 90%.
      - alert: WindowsServerCpuUsage
        expr: 100 - (avg by (instance) (rate(windows_cpu_time_total{mode="idle"}[2m])) * 100) > 90
        for: 0m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server CPU Usage"
          description: "xxx系统{{ $labels.instance }}CPU使用率超过90%"
          value: "{{ $value }}"

      # Physical memory usage (100 - free/total %), lower tier, > 80%.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系统{{ $labels.instance }}内存使用率超过80%"
          value: "{{ $value }}"

      # Physical memory usage, upper tier, > 90%.
      - alert: WindowsServerMemoryUsage
        expr: 100 - ((windows_os_physical_memory_free_bytes / windows_cs_physical_memory_bytes) * 100) > 90
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server memory Usage"
          description: "xxx系统{{ $labels.instance }}内存使用率超过90%"
          value: "{{ $value }}"

      # Logical disk usage above 80%. Both operands are divided by the same
      # factor (1024 / 1024), so the ratio equals free_bytes / size_bytes.
      - alert: WindowsServerDiskSpaceUsage
        expr: 100.0 - 100 * ((windows_logical_disk_free_bytes / 1024 / 1024 ) / (windows_logical_disk_size_bytes / 1024 / 1024)) > 80
        for: 2m
        labels:
          severity: Error
        annotations:
          summary: "Windows Server disk Space Usage"
          description: "xxx系统{{ $labels.instance }}磁盘使用率超过80%"
          value: "{{ $value }}"
```