某业务涉及大量的图像实时处理,请求由一台nginx负载均衡到三台Linux图像处理服务器。由于业务的社会关注度较高,需要特别保证系统的可用性,因此要在业务峰值时实时监控服务器及应用的状态。
1.新建监控数据表
2.利用python脚本获取服务器性能状态,存入监控表,并监控重要指标,达预警线时调用第三方语音API,拨打电话至手机
3.帆软从mysql取数展示监控数据和应用日志数据
1.在数据库中新建监控表
-- Monitoring table: one row per sample written by job() in get_sys_info.py.
-- Every column, including the numeric metrics and create_time, is declared as
-- a nullable varchar(255); the script inserts its values through %s parameters
-- and fills create_time with now().
CREATE TABLE `sys_info` (
-- IP of the sampled server (the script passes its configured LOCAL_IP).
`ip` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
-- Memory figures produced by memory_stat().
`mem_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存占用百分比',
`mem_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '内存已用',
`mem_total` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总内存',
`mem_buffers` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
-- NOTE(review): the COMMENT text below says "utilisation", but the script fills
-- these three columns with the first three fields of /proc/loadavg, i.e. the
-- 1/5/15-minute load averages, not percentages.
`cpu_lavg_1` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '1分钟利用率',
`cpu_lavg_5` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '5分钟利用率',
`cpu_lavg_15` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '15分钟利用率',
-- NOTE(review): the script stores the raw "running/total" field of
-- /proc/loadavg here, although the COMMENT describes it as the CPU count.
`cpu_nr` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT 'cpu数',
`cpu_running_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '正在运行的进程数',
`cpu_total_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '总进程数',
`cpu_last_pid` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '活跃进程id',
-- Disk figures for the root filesystem produced by disk_stat().
`disk_used` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用',
`disk_capacity` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL,
`disk_available` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘可用',
`disk_percent` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '磁盘已用占比',
-- One raw nvidia-smi output line per GPU (up to four); NULL when absent.
`gpu_0` varchar(255) DEFAULT NULL,
`gpu_1` varchar(255) DEFAULT NULL,
`gpu_2` varchar(255) DEFAULT NULL,
`gpu_3` varchar(255) DEFAULT NULL,
-- Line counts from "netstat -anp | grep <port> | wc -l" for the two services.
`server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '8888图像处理主服务',
`image_server_process` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL COMMENT '8898图像处理换底服务',
-- Sample timestamp; the insert statement supplies now().
`create_time` varchar(255) CHARACTER SET utf8mb3 COLLATE utf8mb3_general_ci DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb3;
# fileName: get_sys_info.py
# 为了方便,脚本直接引用了业务框架(flask)中的一些参数和实例,若不依赖flask就麻烦一点,老老实实装pymysql操作数据库
# 核心函数 os.popen("shell命令"),执行该命令并返回该shell命令的打印信息
import time, os
from flask import Flask
from utils.mysqlclient import SqlUtil
import config
# Flask application object; in the visible script it is only passed to
# SqlUtil.init_app() below (no routes are registered and app.run() is not called).
app = Flask(__name__)
# Shared database helper; job() uses it to insert one monitoring row per sample.
sqlUtil = SqlUtil()
# Bind the helper to the app with the DB_CONFIG of the active configuration
# entry (config.config looked up by config.config_name).
sqlUtil.init_app(app, config.config.get(config.config_name).DB_CONFIG)
def job():
    """Take one monitoring sample, alert if a threshold is exceeded, store it.

    Collects memory, CPU load, disk, GPU and connection figures from the
    sibling ``*_stat()`` helpers, places a voice call through ``call_phone()``
    when any metric is above its threshold, and finally inserts the sample
    into the ``sys_info`` table through ``sqlUtil``.
    """
    # Gather the current state of the system resources.
    mem = memory_stat()
    cpu = cpu_stat()
    disk = disk_stat()
    gpu = gpu_stat()
    process = process_stat()

    # When a metric is above its threshold, call the third-party voice API.
    phone = '181xxxx6589'  # phone number that receives the alert call
    dangerous = {  # alert thresholds
        'mem_percent': 90,
        'cpu_lavg_1': 98,
        'disk_percent': 95,
    }
    # cpu_stat() returns the /proc/loadavg fields as strings, so the load
    # average must be converted before it can be compared with a number.
    # NOTE(review): a load average is not a percentage; confirm that 98 is
    # really the intended limit for the 1-minute load.
    if (mem['percent'] > dangerous['mem_percent']
            or float(cpu['lavg_1']) > dangerous['cpu_lavg_1']
            or disk['percent'] > dangerous['disk_percent']):
        call_phone(phone)

    sql = ("insert into sys_info (ip, mem_percent, mem_used, mem_total, mem_buffers, cpu_lavg_1, cpu_lavg_5, "
           "cpu_lavg_15, cpu_nr, cpu_running_process, cpu_total_process, cpu_last_pid, disk_used, disk_capacity, "
           "disk_available, disk_percent, gpu_0, gpu_1, gpu_2, gpu_3, server_process, image_server_process, "
           "create_time) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, "
           "%s, %s, %s, now())")
    # One value per %s placeholder, in column order; create_time is now().
    params = [
        config.config.get(config.config_name).LOCAL_IP,
        mem['percent'], mem['used'], mem['MemTotal'], mem['Buffers'],
        cpu['lavg_1'], cpu['lavg_5'], cpu['lavg_15'], cpu['nr'],
        cpu['running_process'], cpu['total_process'], cpu['last_pid'],
        disk['used'], disk['capacity'], disk['available'], disk['percent'],
        gpu['gpu_0'], gpu['gpu_1'], gpu['gpu_2'], gpu['gpu_3'],
        process['server_process'], process['image_server_process'],
    ]
    sqlUtil.exec(sql, params)
# Memory monitoring
def memory_stat(meminfo_path='/proc/meminfo'):
    """Return a summary of memory usage parsed from a meminfo-style file.

    Args:
        meminfo_path: File with ``Name:   value [unit]`` lines; defaults to
            ``/proc/meminfo``.

    Returns:
        dict with ``percent`` (used memory as an integer percentage of
        ``MemTotal``) and ``used``, ``MemTotal`` and ``Buffers``, each divided
        by 1024 * 1024 and rounded to two decimals (GiB, assuming the file
        reports kB as /proc/meminfo does).

    Raises:
        KeyError: if MemTotal, MemFree, Buffers or Cached is missing.
        ValueError: if a value field is not numeric.
    """
    mem = {}
    # The context manager closes the file even if parsing fails.
    with open(meminfo_path, 'r') as f:
        for line in f:
            name, sep, rest = line.partition(':')
            fields = rest.split()
            # Skip blank lines and anything that is not "name: value".
            if not sep or not fields:
                continue
            mem[name] = float(fields[0])

    # Used memory excludes free pages, buffers and the page cache.
    mem['MemUsed'] = mem['MemTotal'] - mem['MemFree'] - mem['Buffers'] - mem['Cached']

    unit = 1024 * 1024
    # Report usage percentage, used memory, total memory and buffer size.
    return {
        'percent': int(round(mem['MemUsed'] / mem['MemTotal'] * 100)),
        'used': round(mem['MemUsed'] / unit, 2),
        'MemTotal': round(mem['MemTotal'] / unit, 2),
        'Buffers': round(mem['Buffers'] / unit, 2),
    }
# CPU load monitoring
def cpu_stat(loadavg_path='/proc/loadavg'):
    """Return the whitespace-separated fields of a loadavg-style file.

    Args:
        loadavg_path: File to read; defaults to ``/proc/loadavg``.

    Returns:
        dict of strings: ``lavg_1``, ``lavg_5``, ``lavg_15`` (first three
        fields), ``nr`` (fourth field, ``running/total``), its two halves as
        ``running_process`` and ``total_process``, and ``last_pid`` (fifth
        field). Values are returned as read, without numeric conversion.

    Raises:
        IndexError: if the file has fewer than five fields or the fourth
            field contains no ``/``.
    """
    # The context manager closes the file even if reading fails.
    with open(loadavg_path) as f:
        fields = f.read().split()

    loadavg = {
        'lavg_1': fields[0],
        'lavg_5': fields[1],
        'lavg_15': fields[2],
        'nr': fields[3],
        'last_pid': fields[4],
    }
    # The fourth field has the form "running/total".
    process_counts = loadavg['nr'].split('/')
    loadavg['running_process'] = process_counts[0]
    loadavg['total_process'] = process_counts[1]
    return loadavg
# Disk space monitoring
def disk_stat(path='/'):
    """Return disk usage of the filesystem containing *path*, in GiB.

    All sizes are derived from ``os.statvfs`` block counts multiplied by
    ``f_frsize``, the unit those counts are expressed in.

    Args:
        path: Any path on the filesystem to inspect; defaults to ``/``.

    Returns:
        dict with ``used``, ``capacity`` and ``available`` (GiB, rounded to
        two decimals) and ``percent`` (used / capacity as an integer
        percentage, 0 when the filesystem reports no capacity).
        ``available`` is based on ``f_bavail`` (blocks available to
        unprivileged users), so ``used + available`` may be less than
        ``capacity``.
    """
    st = os.statvfs(path)
    gib = float(1024 * 1024 * 1024)

    capacity_bytes = st.f_frsize * st.f_blocks
    used_bytes = st.f_frsize * (st.f_blocks - st.f_bfree)
    available_bytes = st.f_frsize * st.f_bavail

    res = {
        'used': round(used_bytes / gib, 2),
        'capacity': round(capacity_bytes / gib, 2),
        'available': round(available_bytes / gib, 2),
    }
    # Compute the ratio from the raw byte counts to avoid rounding error and
    # guard against filesystems that report zero blocks.
    if capacity_bytes:
        res['percent'] = int(round(float(used_bytes) / capacity_bytes * 100))
    else:
        res['percent'] = 0
    return res
# GPU monitoring
def gpu_stat(smi_lines=None):
    """Return the raw ``nvidia-smi`` status line of each GPU.

    Args:
        smi_lines: Optional iterable of ``nvidia-smi`` output lines. When
            omitted, the command is executed and its output is used.

    Returns:
        dict with keys ``gpu_0`` .. ``gpu_3`` (``None`` when there is no such
        GPU). Each value is an unmodified output line that contains both
        ``MiB`` and ``%`` (presumably the per-GPU row showing memory and
        utilisation). Any further matching lines are added as ``gpu_4``,
        ``gpu_5`` and so on.
    """
    gpu = {  # up to four GPUs are reported; extend if more are installed
        'gpu_0': None,
        'gpu_1': None,
        'gpu_2': None,
        'gpu_3': None,
    }
    if smi_lines is None:
        # Close the pipe once the output has been read.
        with os.popen('nvidia-smi') as pipe:
            smi_lines = pipe.readlines()

    status_lines = [line for line in smi_lines if 'MiB' in line and '%' in line]
    # Number the lines by position: looking the index up by value would map
    # GPUs with identical status lines onto the same key.
    for index, line in enumerate(status_lines):
        gpu['gpu_' + str(index)] = line
    return gpu
# Connection count monitoring
def process_stat():
    """Count ``netstat`` lines that mention the two service ports.

    Returns:
        dict with ``server_process`` (pattern 8888) and
        ``image_server_process`` (pattern 8898). Each value is the line count
        as a string without surrounding whitespace, or ``None`` if the
        command produced no output.
    """
    return {
        'server_process': _count_netstat_lines('8888'),
        'image_server_process': _count_netstat_lines('8898'),
    }


def _count_netstat_lines(pattern):
    """Return the ``wc -l`` count of ``netstat -anp`` lines matching *pattern*."""
    # NOTE(review): grep matches the digits anywhere in the line (PIDs, other
    # ports such as 18888), so this may over-count - confirm whether a
    # stricter pattern is wanted.
    with os.popen('netstat -anp |grep ' + pattern + ' |wc -l') as pipe:
        output = pipe.readlines()
    if not output:
        return None
    # Drop the trailing newline (and any padding) so a clean number is stored.
    return output[-1].strip()
# Place the alert phone call
def call_phone(phone):
    """Send a voice notification through the Dingxin voice API.

    Free call quota can be obtained at
    https://market.aliyun.com/products/56928004/cmapi026600.html?spm=5176.2020520132.101.2.51547218rkAXxy

    Args:
        phone: Phone number that should receive the call.

    Returns:
        True when the API answers HTTP 200 with ``return_code`` ``'00000'``;
        False on any HTTP error, network error, timeout or unexpected body.
    """
    # Local imports keep this block self-contained; only the standard library
    # is used, so no extra package has to be installed.
    import json
    import urllib.error
    import urllib.parse
    import urllib.request

    appcode = ''  # fill in the appcode obtained from the URL above
    API_BY_VOICE_CODE_DINGXIN = 'http://yuyin2.market.alicloudapi.com/dx/voice_notice'
    data = {
        'tpl_id': 'TP1801174',
        'phone': phone,
        'param': 'name:{name},msg:{msg}'.format(
            name='管理员', msg='服务器状态达警戒线,请注意')
    }
    # The fields are sent as a form-encoded POST body.
    req = urllib.request.Request(
        API_BY_VOICE_CODE_DINGXIN,
        data=urllib.parse.urlencode(data).encode('utf-8'),
        headers={'Authorization': 'APPCODE {}'.format(appcode)},
        method='POST',
    )
    try:
        # A timeout keeps a hanging API from blocking the monitoring loop.
        with urllib.request.urlopen(req, timeout=10) as response:
            status = response.status
            body = response.read()
    except urllib.error.HTTPError as err:
        # 4xx/5xx answers (e.g. 400, 401, 403) arrive here.
        print(err.code, err.read().decode('utf-8', errors='replace'))
        return False
    except OSError as err:
        # URLError and socket timeouts are both OSError subclasses.
        print(err)
        return False

    try:
        result = json.loads(body.decode('utf-8'))
    except ValueError:
        print(body)
        return False

    if status == 200 and isinstance(result, dict) and result.get('return_code') == '00000':
        print('发送语音成功')
        return True
    print(result)
    return False
if __name__ == '__main__':
    import traceback

    while True:
        time.sleep(5)  # take one sample every 5 seconds
        try:
            job()
        except Exception:
            # A single failed sample (e.g. a database or network hiccup) must
            # not stop the monitor; log the traceback and try again next round.
            traceback.print_exc()
由于依赖于flask,将get_sys_info.py放入flask根目录,执行
nohup python get_sys_info.py & # 后台运行
ps -ef |grep python # 查看脚本进程状态