{df_html}
这个只是我作为数据库和开发小白自己的一个解决思路,如果有更恰当的思路,欢迎评论或私聊呀~
业务数据库被异常调用时导致慢查询量增大,影响到正常业务使用,业务只能通过nginx超时异常等来进行问题排查,增大了定位和处理问题的难度、时间,尤其商品库或者交易库可能会出现页面无法正常显示的情况。故业务需求为若主库和从库慢查询总量每分钟超过某个阈值之后则发送邮件报警。
不要使用 --log-dsn 参数将pt-kill结果写入数据库中,否则会出现主从数据不一致,从而导致主从异常中断。
使用 --log 参数直接将kill掉的记录写入到本地文件中,文件名格式为:角色_端口_主机区分_时间戳_slow.log。
例如:主库某个时间的文件名为:m_3306_733_20200810114116_slow.log,表示端口为3306的主库,在主机IP后两段拼接为733的主机上,于2020年8月10日11时41分16秒生成的kill日志文件。
由于markdown粘贴代码,缩进可能有部分问题,复制粘贴后需注意一下;并且这里取消了log模块,可将需要的内容打印到自己使用的log中去。
# 公共函数
## 1. 获取数据库连接
from sqlalchemy import create_engine
def adminMySQLConn(User, Pass, Host, Port, DBName):
    """Build a SQLAlchemy engine for a MySQL database via the pymysql driver.

    Args:
        User: Database user name.
        Pass: Database password.
        Host: Database host.
        Port: Database port.
        DBName: Schema to connect to.

    Returns:
        The object returned by ``create_engine``, or ``None`` when building
        the engine fails (the error is logged instead of raised).
    """
    import logging
    from urllib.parse import quote_plus

    try:
        # Percent-encode the credentials so characters such as '@' or '/'
        # in a password cannot corrupt the URL structure.
        adminConn = "mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(
            quote_plus(str(User)), quote_plus(str(Pass)), Host, Port, DBName)
        return create_engine(adminConn)
    except Exception as err:
        # The original handler referenced ``self`` inside a plain function,
        # which raised NameError and hid the real error.
        logging.getLogger(__name__).error("adminMySQLConn: %s", err)
        return None
## 2. 获取当前时间戳
from datetime import datetime
def initCurrentDateTime():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``.

    The original wrapped this in a try/except whose handler referenced an
    undefined ``self`` and the Python-2-only ``err.message``; the handler
    could only mask errors, so it was removed.
    """
    return datetime.now().strftime('%Y%m%d%H%M%S')
## 3. 远程/本地执行linux命令(在本地执行可以直接使用os.system或subprocess.getstatusoutput,但由于这里是个公共函数,其他程序可能需要远程执行,故这里统一使用paramiko执行)
from paramiko import SSHClient, AutoAddPolicy
def executeSSH(ip, cmds):
    """Run a shell command on a host over SSH using GSSAPI authentication.

    A Kerberos ticket is requested with ``kinit`` from ``/etc/krb5.keytab``
    before connecting.

    Args:
        ip: Address of the host to connect to.
        cmds: Command line passed to ``exec_command``.

    Returns:
        Tuple ``(status, output)``: the command's exit status and its decoded
        standard output. Standard error is not captured.

    Raises:
        Exception: Any error raised while connecting or executing propagates
            to the caller.
    """
    # ``system`` was called without ever being imported in the original.
    from os import system

    client = SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without checking
    # them; confirm this is acceptable for the target environment.
    client.set_missing_host_key_policy(AutoAddPolicy())
    try:
        system('kinit -kt /etc/krb5.keytab')
        client.connect(ip, look_for_keys=False, gss_auth=True, gss_kex=True)
        _stdin, stdout, _stderr = client.exec_command(cmds)
        _result = stdout.read().decode()  # command output
        _status = stdout.channel.recv_exit_status()  # command exit status
        return _status, _result
    finally:
        # Close the connection on the error path too, not only on success.
        client.close()
由于这里存在一个管理库,上面有所有数据库的信息,则通过pymysql去查看并获取对应信息即可。
# Input parameters, i.e. the ``info`` keyword arguments passed to GetHost.
# (The original literal had a stray quote after 3306, which is a syntax error.)
Inputinfo = {
    "rsPort": 3306,
}
# tb_mysql_instance is the admin table: rsPort is the port, rsHost the host
# the database runs on, rsRole the database role (master or slave).
SQLlist = {
    "getHost": "select rsHost, rsPort, rsRole from tb_mysql_instance where rsPort = {} and rsRole in ('master', 'slave');"
}
def GetHost(**info):
    """Look up the master and slave instances listening on a port.

    Each row returned by the ``getHost`` query is converted to a dict and
    extended with two keys:

    * ``logFile``: ``<m|s>_<rsPort>_<host tag>_<timestamp>_slow.log``, where
      the host tag is the host's dot-separated parts from the third onward
      joined together, and the prefix is ``m`` for a master, ``s`` otherwise.
    * ``ptkillLogFile``: ``./SlowLog/<logFile>``.

    Args:
        **info: Must contain ``rsPort``, the port to look up.

    Returns:
        ``(True, list_of_dicts)`` on success, otherwise ``(False, message)``
        where ``message`` is a string describing the failure.
    """
    port = info.get('rsPort')
    try:
        # The port is interpolated into SQL text, so force it to an integer
        # first; anything else is rejected before a query is built.
        executeSQL = SQLlist['getHost'].format(int(port))
        # NOTE(review): called without arguments, as in the original, although
        # the adminMySQLConn shown in this file takes five parameters; confirm
        # where the connection settings are supposed to come from.
        adminConn = adminMySQLConn()
        insList = adminConn.execute(executeSQL).fetchall()
        if not insList:
            msg = "Get {} pt-kill master and slave host info failed. SQL:{} insList: {}".format(
                port, executeSQL, insList)
            return False, msg

        # One timestamp for the whole batch so all log files share it.
        timestamp = initCurrentDateTime()
        insInfoList = []
        for insInfo in insList:
            infoDict = dict(zip(insInfo.keys(), insInfo.values()))
            rolePrefix = 'm' if infoDict['rsRole'] == 'master' else 's'
            hostTag = ''.join(infoDict['rsHost'].split('.')[2:])
            infoDict['logFile'] = "{}_{}_{}_{}_slow.log".format(
                rolePrefix, infoDict['rsPort'], hostTag, timestamp)
            infoDict['ptkillLogFile'] = "./SlowLog/{}".format(infoDict['logFile'])
            insInfoList.append(infoDict)
        return True, insInfoList
    except Exception as err:
        # Build one string; the original produced a (str, exception) tuple.
        msg = "Get {} pt-kill master and slave host info err: {}".format(port, err)
        return False, msg
# 输出结果示例:
[{
'rsHost': '1.1.1.1', 'rsPort': 3306, 'rsRole': 'master', 'logFile': 'm_3306_11_20200810084828_slow.log', 'ptkillLogFile': '/path/SlowLog/m_3306_11_20200810084828_slow.log'}, {
'rsHost': '2.2.2.2', 'rsPort': 3306, 'rsRole': 'slave', 'logFile': 's_3306_22_20200810084828_slow.log', 'ptkillLogFile': '/path/SlowLog/s_3306_22_20200810084828_slow.log'}, {
'rsHost': '3.3.3.3', 'rsPort': 3306, 'rsRole': 'slave', 'logFile': 's_3306_33_20200810084828_slow.log', 'ptkillLogFile': '/path/SlowLog/s_3306_33_20200810084828_slow.log'}]
# 后面的info均表示上面初始化信息列表中的单个字典元素
# 例如:{'rsHost': '1.1.1.1', 'rsPort': 3306, 'rsRole': 'master', 'logFile': 'm_3306_11_20200810084828_slow.log', 'ptkillLogFile': '/path/SlowLog/m_3306_11_20200810084828_slow.log'}
# 子函数
## 执行pt-kill命令,pt-kill一次只执行60s
def ExecutePtkillCmd(**info):
    """Start a 60-second daemonized pt-kill run against one instance.

    The command is executed through ``executeSSH`` on 10.148.16.25 and is told
    to kill and print busy queries, writing its log to ``ptkillLogFile``.

    Args:
        **info: Must contain ``rsHost``, ``rsPort`` and ``ptkillLogFile``.

    Returns:
        ``(True, message)`` when the command exits with status 0, otherwise
        ``(False, message)``; ``message`` is always a string.
    """
    import shlex

    # Use .get() so building a message can never raise inside the handler.
    target = "{}:{}".format(info.get('rsHost'), info.get('rsPort'))
    try:
        # Quote every interpolated value: the command runs through a shell.
        quoted = {key: shlex.quote(str(info[key]))
                  for key in ('rsHost', 'rsPort', 'ptkillLogFile')}
        # NOTE(review): the account and password are hard-coded in the
        # command line; consider moving them to configuration.
        ptkillCmd = (
            "/opt/soft/percona-toolkit-2.2.14/bin/pt-kill --no-version-check "
            "--host {rsHost} --port {rsPort} --user 'dba' --password '5d63f33c10b8f430'"
            " --busy-time 2 --match-state='Sending data|Sorting result' --victim all "
            "--interval 1 --run-time 60 --daemonize --kill --print --log={ptkillLogFile}"
        ).format(**quoted)
        status, _ret = executeSSH('10.148.16.25', ptkillCmd)
        if status == 0:
            return True, "Execute {} pt-kill command success.".format(target)
        # NOTE(review): this message embeds the full command, password included.
        return False, "Execute {} pt-kill command failed, Cmd: {}".format(target, ptkillCmd)
    except Exception as err:
        return False, "Execute {} pt-kill command error. {}".format(target, err)
## 获取慢SQL数量,并写入info中
def GetFileRegixCount(**info):
    """Count the kill records in a pt-kill log file.

    A record is a line that starts with ``# `` followed by a ``YYYY-MM-DD``
    date. The count is stored under ``slowSQLCount``.

    Args:
        **info: Must contain ``ptkillLogFile``; ``rsHost`` and ``rsPort`` are
            used in the error message.

    Returns:
        ``(True, info)`` with ``info['slowSQLCount']`` set, or
        ``(False, message)`` on error. A log file that cannot be opened counts
        as zero records, matching the empty output the original ``grep``
        pipeline produced in that case.
    """
    import logging
    import re

    logger = logging.getLogger(__name__)
    # Same pattern the original handed to grep, in Python regex syntax.
    recordPattern = re.compile(rb'^# [0-9]{4}-[0-9]{2}-[0-9]{2}')
    try:
        logPwd = info['ptkillLogFile']
        try:
            # Read the file directly instead of spawning grep; the original
            # never imported Popen/PIPE and never reaped the child process.
            with open(logPwd, 'rb') as logFile:
                info['slowSQLCount'] = sum(
                    1 for line in logFile if recordPattern.match(line))
        except OSError as err:
            logger.warning("Cannot read %s, counting 0 records: %s", logPwd, err)
            info['slowSQLCount'] = 0
        return True, info
    except Exception as err:
        msg = "Get {}:{} slow log count err: {}".format(
            info.get('rsHost'), info.get('rsPort'), err)
        logger.error(msg)
        return False, msg
### 输出info格式类似为:
{
'rsHost': '1.1.1.1', 'rsPort': 3306, 'rsRole': 'master', 'logFile': 'm_3306_11_20200810084828_slow.log', 'ptkillLogFile': '/path/SlowLog/m_3306_11_20200810084828_slow.log', 'slowSQLCount': 0}
## 每60s触发一次pt-kill的执行
from time import sleep
def killSlowSql(info):
    """Run pt-kill for one instance, wait, then count what it killed.

    Args:
        info: Dict describing the instance; it is passed as keyword arguments
            to ``ExecutePtkillCmd`` and ``GetFileRegixCount``.

    Returns:
        The failing ``(status, message)`` from ``ExecutePtkillCmd`` when the
        command could not be started; otherwise the result of
        ``GetFileRegixCount``. Unexpected errors yield ``(False, message)``
        with a string message.
    """
    try:
        status, msg = ExecutePtkillCmd(**info)
        if not status:
            return status, msg
        # pt-kill is started with --run-time 60, so wait for it to finish
        # before reading its log.
        sleep(60)
        return GetFileRegixCount(**info)
    except Exception as err:
        # Look keys up defensively: the original formatted with **info here,
        # which raised again whenever a key was missing.
        details = info if isinstance(info, dict) else {}
        msg = "kill {}:{} {} err: {}".format(
            details.get('rsHost'), details.get('rsPort'),
            details.get('ptkillLogFile'), err)
        return False, msg
## 多进程执行pt-kill
from multiprocessing import Pool
def main(*HostInfoList):
    """Run ``killSlowSql`` for every instance in parallel worker processes.

    Args:
        *HostInfoList: One info dict per instance.

    Returns:
        ``(True, results)`` where ``results`` holds the second element of each
        ``killSlowSql`` return value, in input order. That element is an info
        dict on success and an error message on failure, so callers must
        handle both. ``(False, error)`` is returned if running the pool fails.
    """
    if not HostInfoList:
        # Nothing to do; avoid spawning worker processes for no work.
        return True, []
    try:
        # At most 8 workers as before, but never more than there are jobs.
        # The context manager releases the pool on the error path as well.
        with Pool(min(8, len(HostInfoList))) as pool:
            pending = [pool.apply_async(killSlowSql, (info,))
                       for info in HostInfoList]
            pool.close()
            pool.join()
            infoList = [res.get()[1] for res in pending]
        return True, infoList
    except Exception as err:
        return False, err
由于python自带html表格样式有些许丑,所以参考这个小姐姐的表格前端页面来进行了修改。
import pandas as pd
# Prefix that formatHtmlTable prepends to the mail body. As written here the
# string holds only a newline.
# NOTE(review): presumably the original HTML head/style markup was lost when
# this text was extracted - confirm against the author's source.
head = \
"""
"""
# 转换为表格需要的输入
## 输入格式类似为result=[[1,2,3],['a','b','c']], title=['id', 'name']
def convert_to_html(result, title):
    """Render column-oriented data as an HTML table.

    Args:
        result: Sequence of columns; ``result[i]`` holds the values for
            ``title[i]``, e.g. ``[[1, 2, 3], ['a', 'b', 'c']]``.
        title: Column names, e.g. ``['id', 'name']``.

    Returns:
        The HTML produced by ``DataFrame.to_html`` with ``cellspacing="0"``
        added to the ``<table>`` tag.
    """
    columns = dict(zip(title, result))
    df = pd.DataFrame(columns)[list(title)]
    # Long cell values would otherwise be truncated in the table. The options
    # are scoped to this call instead of being changed for the whole process.
    with pd.option_context('display.max_colwidth', 200,
                           'display.colheader_justify', 'center'):
        h = df.to_html(col_space=30, border=1, justify='center')
    # Patch only the first "class" (the one in the <table> tag); replacing
    # every occurrence also rewrote cell text that contains the word.
    return h.replace('class', 'cellspacing=\"0\" class', 1)
# 初始化表格
def formatHtmlTable(result, title):
    """Build the mail body that embeds the data as an HTML table.

    Args:
        result: Column-oriented values, passed to ``convert_to_html``.
        title: Column names, passed to ``convert_to_html``.

    Returns:
        UTF-8 encoded bytes: ``head`` followed by the body text containing the
        rendered table, with every newline removed.
    """
    template = "\n慢查询SQL邮件报警\n详情请查看附件\n{df_html}\n"
    table_html = convert_to_html(result, title)
    page = "".join(("", head, template.format(df_html=table_html), ""))
    return page.replace('\n', '').encode("utf-8")
import smtplib
from email.mime.text import MIMEText
from email.header import Header
from email.mime.multipart import MIMEMultipart
## 发送邮件,username和password为发送人邮箱的用户名和密码。注意密码为smtp的授权码
def sendMail(sender, receivers, message):
    """Send a prepared message through the SMTP server.

    ``username`` and ``password`` are the sending mailbox's credentials; the
    password is the SMTP authorization code, not the mailbox password.

    Args:
        sender: Envelope sender address.
        receivers: List of recipient addresses.
        message: Message object providing ``as_string()``.

    Returns:
        ``(True, "send mail success")`` or ``(False, "send mail failed")``.
    """
    username = 'xxxx@qq.com'
    password = 'xxxxxx'  # the original literal was missing its closing quote
    try:
        # The context manager sends QUIT and closes the socket on failure too.
        with smtplib.SMTP(host='xxxx.qq.com', port=25) as smtp:
            smtp.login(username, password)
            smtp.sendmail(sender, receivers, message.as_string())
        return True, "send mail success"
    except (smtplib.SMTPException, OSError):
        # OSError covers DNS and connection failures, which the original let
        # escape instead of reporting them as a failed send.
        return False, "send mail failed"
## 发送报警
def sendMailAlert(*infoList, threshold=5):
    """Mail an alert when the total number of killed queries is too high.

    The mail contains an HTML table with ``rsPort``, ``rsRole``, ``logFile``
    and ``slowSQLCount`` per instance, and each pt-kill log as an attachment.

    Args:
        *infoList: Per-instance results. Entries that are not dicts with a
            ``slowSQLCount`` key (for example error messages) are skipped
            with a warning.
        threshold: The alert is sent only when the summed ``slowSQLCount`` is
            strictly greater than this value. Defaults to 5.

    Returns:
        The ``(status, message)`` result of ``sendMail`` when an alert is
        sent, otherwise ``(True, 'Dont need sendmail.')``.
    """
    import logging
    from email.mime.application import MIMEApplication

    logger = logging.getLogger(__name__)

    hostInfos = []
    for info in infoList:
        if isinstance(info, dict) and 'slowSQLCount' in info:
            hostInfos.append(info)
        else:
            logger.warning("Skip entry without slowSQLCount: %s", info)

    sumCount = sum(info['slowSQLCount'] for info in hostInfos)
    if sumCount <= threshold:
        # The original called an undefined ``log`` here and returned nothing.
        msg = 'Dont need sendmail.'
        logger.info(msg)
        return True, msg

    sender = '[email protected]'  # sender address
    receivers = ['[email protected]']  # recipient addresses, several allowed
    message = MIMEMultipart()
    message['From'] = Header("lichunliang", 'utf-8')  # sender display name
    message['To'] = Header("business_rds", 'utf-8')  # recipient display name
    subject = '慢查询邮件告警测试'
    message['Subject'] = Header(subject, 'utf-8')

    title = ['rsPort', 'rsRole', 'logFile', 'slowSQLCount']
    # One list per column, in the layout formatHtmlTable expects.
    retList = [[info[t] for info in hostInfos] for t in title]

    for info in hostInfos:
        logPath = info['ptkillLogFile']
        try:
            with open(logPath, 'rb') as logFile:
                attachment = MIMEApplication(logFile.read())
        except OSError as err:
            # A missing log must not prevent the alert itself from going out.
            logger.warning("Cannot attach %s: %s", logPath, err)
            continue
        attachment.add_header('Content-Disposition', 'attachment',
                              filename=logPath.split('/')[-1])
        message.attach(attachment)

    html_msg = formatHtmlTable(retList, title)
    if isinstance(html_msg, bytes):
        # formatHtmlTable returns UTF-8 bytes; MIMEText is given text.
        html_msg = html_msg.decode('utf-8')
    message.attach(MIMEText(html_msg, 'html', 'utf-8'))
    return sendMail(sender, receivers, message)