帆软+python+mysql实现服务器监控大屏并自动告警

帆软+python+mysql实现服务器监控大屏并自动告警

背景

某业务涉及大量的图像实时处理,请求由一台nginx负载均衡到三台Linux图像处理服务器。由于业务的社会关注度较高,需要特别保证系统的可用性,因此要在业务峰值时实时监控服务器及应用状态。

设计

1.新建监控数据表
2.利用python脚本定时获取服务器性能状态并存入监控表,同时监控重要指标,达到预警线时调用第三方语音API拨打电话至手机
3.帆软从mysql取数展示监控数据和应用日志数据

实施步骤

1.在数据库中新建监控表

-- Monitoring table: the collector script (get_sys_info.py) inserts one row per sample.
-- Every column, including create_time, is declared varchar(255); no primary key or index is defined.
-- cpu_lavg_1 / cpu_lavg_5 / cpu_lavg_15 receive the first three fields of /proc/loadavg.
-- cpu_nr receives the fourth /proc/loadavg field as raw "running/total" text, which the
-- script also splits into cpu_running_process and cpu_total_process.
-- gpu_0 .. gpu_3 receive the per-GPU status lines filtered from `nvidia-smi` output.
-- server_process / image_server_process receive the line counts of
-- `netstat -anp | grep <port> | wc -l` for ports 8888 and 8898.
-- create_time is filled by the script with now().
CREATE TABLE `sys_info` (
  `ip` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `mem_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存占用百分比',
  `mem_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存已用',
  `mem_total` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总内存',
  `mem_buffers` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `cpu_lavg_1` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '1分钟利用率',
  `cpu_lavg_5` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '5分钟利用率',
  `cpu_lavg_15` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '15分钟利用率',
  `cpu_nr` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT 'cpu数',
  `cpu_running_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '正在运行的进程数',
  `cpu_total_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总进程数',
  `cpu_last_pid` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '活跃进程id',
  `disk_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用',
  `disk_capacity` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
  `disk_available` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘可用',
  `disk_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用占比',
  `gpu_0` varchar(255) DEFAULT NULL,
  `gpu_1` varchar(255) DEFAULT NULL,
  `gpu_2` varchar(255) DEFAULT NULL,
  `gpu_3` varchar(255) DEFAULT NULL,
  `server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '8888图像处理主服务',
  `image_server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '8898图像处理换底服务',
  `create_time` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3;

2.编写python脚本获取服务器数据,并监控告警,拨打电话

# fileName: get_sys_info.py
# 为了方便,脚本直接引用了业务框架(flask)中的一些参数和实例,若不依赖flask就麻烦一点,老老实实装pymysql操作数据库
# 核心函数 os.popen("shell命令"),执行该命令并返回该shell命令的打印信息

import time, os
from flask import Flask
from utils.mysqlclient import SqlUtil
import config

# Flask application object; within this script it is only passed to
# SqlUtil.init_app() below (no routes are registered here).
app = Flask(__name__)

# Project database helper, initialised with the DB_CONFIG of the configuration
# profile selected by config.config_name. job() uses it to insert each sample.
sqlUtil = SqlUtil()
sqlUtil.init_app(app, config.config.get(config.config_name).DB_CONFIG)


def job():
    """Take one sample of the server's state, alert if needed, and store it.

    Collects memory, CPU load, disk, GPU and connection statistics, places a
    voice call through ``call_phone`` when any monitored metric is above its
    alert threshold, and then inserts the sample as one row into ``sys_info``.
    """
    # Gather the current resource readings.
    mem = memory_stat()
    cpu = cpu_stat()
    disk = disk_stat()
    gpu = gpu_stat()
    process = process_stat()

    # Alerting: call the phone below via the third-party voice API when a
    # metric crosses its threshold.
    phone = '181xxxx6589'  # phone number that receives the alert call
    dangerous = {  # alert thresholds
        'mem_percent': 90,
        'cpu_lavg_1': 98,
        'disk_percent': 95,
    }
    # cpu_stat() returns the load average as text read from /proc/loadavg, so
    # it has to be converted before it can be compared with a number.
    # NOTE(review): lavg_1 is a load average rather than a percentage; the
    # threshold of 98 is kept from the original - confirm it is intended.
    # NOTE(review): the main loop calls job() every 5 s, so a sustained breach
    # places a call on every sample; consider adding a cooldown.
    if (mem['percent'] > dangerous['mem_percent']
            or float(cpu['lavg_1']) > dangerous['cpu_lavg_1']
            or disk['percent'] > dangerous['disk_percent']):
        call_phone(phone)

    sql = "insert into sys_info (ip, mem_percent, mem_used, mem_total, mem_buffers, cpu_lavg_1, cpu_lavg_5, " \
          "cpu_lavg_15, cpu_nr, cpu_running_process, cpu_total_process, cpu_last_pid, disk_used, disk_capacity, " \
          "disk_available, disk_percent, gpu_0, gpu_1, gpu_2, gpu_3, server_process, image_server_process, " \
          "create_time) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " \
          "%s, %s, %s, now())"
    # One value per placeholder, in the same order as the column list above.
    params = [config.config.get(config.config_name).LOCAL_IP, mem['percent'], mem['used'], mem['MemTotal'],
              mem['Buffers'],
              cpu['lavg_1'], cpu['lavg_5'], cpu['lavg_15'], cpu['nr'], cpu['running_process'],
              cpu['total_process'], cpu['last_pid'], disk['used'], disk['capacity'],
              disk['available'], disk['percent'], gpu['gpu_0'], gpu['gpu_1'], gpu['gpu_2'], gpu['gpu_3'],
              process['server_process'], process['image_server_process']]
    sqlUtil.exec(sql, params)


# 内存监控
def memory_stat(path='/proc/meminfo'):
    """Summarise memory usage from a Linux ``/proc/meminfo``-style file.

    Args:
        path: File to parse. Defaults to ``/proc/meminfo``.

    Returns:
        dict with four entries:
        ``percent``  - used memory as an integer percentage of ``MemTotal``
                       (0 when ``MemTotal`` is 0);
        ``used``     - ``MemTotal - MemFree - Buffers - Cached``;
        ``MemTotal`` - total memory;
        ``Buffers``  - buffer memory.
        The last three are the file's values divided by 1024 * 1024 and
        rounded to two decimals (GiB when the file reports kB, as the kernel
        does).

    Raises:
        KeyError: if ``MemTotal``, ``MemFree``, ``Buffers`` or ``Cached`` is
            missing from the file.
    """
    mem = {}
    with open(path, 'r') as f:
        for line in f:
            name, sep, rest = line.partition(':')
            fields = rest.split()
            # Skip blank lines and anything that is not "Name: value ...".
            if not sep or not fields:
                continue
            try:
                mem[name.strip()] = float(fields[0])
            except ValueError:
                continue

    total = mem['MemTotal']
    used = total - mem['MemFree'] - mem['Buffers'] - mem['Cached']
    scale = 1024 * 1024
    return {
        'percent': int(round(used / total * 100)) if total else 0,
        'used': round(used / scale, 2),
        'MemTotal': round(total / scale, 2),
        'Buffers': round(mem['Buffers'] / scale, 2),
    }


# CPU负载监控
def cpu_stat(path="/proc/loadavg"):
    """Read the load averages and process counters from ``/proc/loadavg``.

    Args:
        path: File to parse. Defaults to ``/proc/loadavg``.

    Returns:
        dict of strings, exactly as they appear in the file:
        ``lavg_1`` / ``lavg_5`` / ``lavg_15`` - 1, 5 and 15 minute load
        averages; ``nr`` - the raw fourth field (``"running/total"``);
        ``running_process`` / ``total_process`` - the two halves of ``nr``;
        ``last_pid`` - the fifth field.

    Raises:
        IndexError: if the file has fewer than five whitespace-separated
            fields.
    """
    with open(path) as f:
        fields = f.read().split()

    # The fourth field has the form "running/total".
    running, _, total = fields[3].partition('/')
    return {
        'lavg_1': fields[0],
        'lavg_5': fields[1],
        'lavg_15': fields[2],
        'nr': fields[3],
        'running_process': running,
        'total_process': total,
        'last_pid': fields[4],
    }


# 磁盘空间监控
def disk_stat(path='/'):
    """Report disk usage of the filesystem containing ``path``.

    Args:
        path: Any path on the filesystem to inspect. Defaults to ``/``.

    Returns:
        dict with ``used`` and ``capacity`` in GiB (rounded to two decimals),
        ``available`` (``capacity - used``, rounded to two decimals) and
        ``percent`` (used as an integer percentage of capacity, 0 when the
        capacity is 0).
    """
    gib = 1024 * 1024 * 1024
    st = os.statvfs(path)
    # statvfs block counts are expressed in f_frsize units, so use that size
    # for both figures to keep them consistent.
    capacity = round(float(st.f_blocks * st.f_frsize) / gib, 2)
    used = round(float((st.f_blocks - st.f_bfree) * st.f_frsize) / gib, 2)

    res = {'used': used, 'capacity': capacity}
    # Rounded so the stored value does not carry float subtraction noise.
    res['available'] = round(capacity - used, 2)
    res['percent'] = int(round(used / capacity * 100)) if capacity else 0
    return res

# gpu监控
def gpu_stat(command='nvidia-smi'):
    """Collect the per-GPU status lines printed by ``nvidia-smi``.

    Args:
        command: Shell command whose output is scanned. Defaults to
            ``nvidia-smi``.

    Returns:
        dict with keys ``gpu_0`` .. ``gpu_3`` preset to ``None``. Every output
        line containing both ``MiB`` and ``%`` is stored, in order of
        appearance and including its trailing newline, under ``gpu_<n>``.
        More than four matching lines add further ``gpu_<n>`` keys.
    """
    gpu = {  # four slots are always present; callers read gpu_0..gpu_3
        'gpu_0': None,
        'gpu_1': None,
        'gpu_2': None,
        'gpu_3': None,
    }
    with os.popen(command) as pipe:
        info = pipe.readlines()

    status_lines = [line for line in info if 'MiB' in line and '%' in line]
    # Number by position: looking the line up by value would map GPUs that
    # report identical status lines onto the same slot.
    for idx, line in enumerate(status_lines):
        gpu['gpu_' + str(idx)] = line
    return gpu

# 会话数监控
def process_stat():
    """Count netstat lines mentioning the two service ports.

    Returns:
        dict with ``server_process`` (port 8888) and ``image_server_process``
        (port 8898). Each value is the count as a string of digits, without
        the trailing newline printed by ``wc -l``.
    """
    return {
        'server_process': _count_netstat_lines(8888),
        'image_server_process': _count_netstat_lines(8898),
    }


def _count_netstat_lines(port):
    """Return the number of ``netstat -anp`` lines containing ``port``, as a string."""
    # NOTE(review): grep matches the digits anywhere in the line (PIDs, other
    # ports such as 18888), so this can over-count - confirm whether a
    # stricter pattern is wanted.
    with os.popen('netstat -anp |grep %d |wc -l' % port) as pipe:
        fields = pipe.read().split()
    # wc normally prints a single number; fall back to '0' on empty output.
    return fields[-1] if fields else '0'


# 拨打电话
def call_phone(phone):
    """Place a voice-notification call to ``phone`` through the Dingxin API.

    Free call quota / appcode can be obtained at
    https://market.aliyun.com/products/56928004/cmapi026600.html?spm=5176.2020520132.101.2.51547218rkAXxy

    Args:
        phone: Phone number to call.

    Returns:
        True when the API answers HTTP 200 with ``return_code == '00000'``;
        False for any HTTP error, network error, unparsable response or other
        return code. Failures are printed and never raised, so a failed alert
        does not interrupt the caller.
    """
    # Imported here so the function does not depend on the file's import block.
    import json
    import urllib.error
    import urllib.parse
    import urllib.request

    appcode = ''  # fill in the appcode obtained from the URL above
    API_BY_VOICE_CODE_DINGXIN = 'http://yuyin2.market.alicloudapi.com/dx/voice_notice'
    data = {
        'tpl_id': 'TP1801174',
        'phone': phone,
        'param': 'name:{name},msg:{msg}'.format(
            name='管理员', msg='服务器状态达警戒线,请注意'),
    }
    req = urllib.request.Request(
        API_BY_VOICE_CODE_DINGXIN,
        data=urllib.parse.urlencode(data).encode('utf-8'),  # form-encoded POST body
        headers={'Authorization': 'APPCODE {}'.format(appcode)},
        method='POST',
    )

    try:
        # The timeout keeps a stalled API from blocking the monitoring loop.
        with urllib.request.urlopen(req, timeout=10) as response:
            status_code = response.status
            body = response.read()
    except urllib.error.HTTPError as err:
        # 4xx/5xx answers (e.g. 400/401/403 for a bad request or appcode).
        print(err.code, err.read().decode('utf-8', 'replace'))
        return False
    except OSError as err:
        # URLError, socket timeouts and other network failures.
        print(err)
        return False

    try:
        result = json.loads(body.decode('utf-8'))
    except ValueError:
        print(body)
        return False

    if status_code == 200 and isinstance(result, dict) and result.get('return_code') == '00000':
        print('发送语音成功')
        return True
    print(result)
    return False


if __name__ == '__main__':
    import traceback

    # Sample forever, one sample every 5 seconds.
    while True:
        time.sleep(5)
        try:
            job()
        except Exception:
            # Top-level boundary: the script runs unattended in the background,
            # so log the failure and keep sampling instead of exiting.
            traceback.print_exc()

运行脚本

由于依赖于flask,将get_sys_info.py放入flask根目录,执行

nohup python get_sys_info.py & # 后台运行
ps -ef |grep python # 查看脚本进程状态

3.帆软大屏展示数据

帆软+python+mysql实现服务器监控大屏并自动告警_第1张图片

你可能感兴趣的:(帆软,python,mysql,服务器,大屏端)