在工作过程中,总有一些需求需要周期性地提供数据支撑。目前数据开发有很多调度工具(dataworks、schedulerx、crontab),大部分都可以实现定时、依赖运行。但往往还差一步:需要数据的大多是产品、运营等负责决策的同事,他们并不擅长自己查询结果,更希望直接拿到表格形式的数据。每天手动导出数据对数据开发的同学来说也很麻烦,因此开发了一个工具,可以将查询结果定时直接发送给提需求的同学。
依赖环境: python环境(作者使用的是python3.9)
所需依赖包:
pip install pymysql (mysql)
pip install pymssql (sqlserver)
pip install pyodps (dataworks)
pip install pandas
pip install easyemail (邮箱模块)
smtplib(python内置模块,不需要安装)
odps模块实现(连接并读取数据):
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/7/20 20:16
@Project: mmb
@File: Db_Odps.py
@Author: rk
@Software: pycharm
@Desc:
'''
from odps import ODPS
from odps.models import Partition
class Db_Odps():
    """Load MaxCompute (ODPS) table partitions into pandas DataFrames.

    Connection settings default to the placeholder values below and can be
    overridden per instance, so real credentials do not have to be edited
    into the method body.
    """

    DEFAULT_ENDPOINT = 'http://service.cn-shenzhen.maxcompute.aliyun-inc.com/api'
    # Alternative endpoint kept from the original source:
    # "https://service.cn-shenzhen.maxcompute.aliyun.com/api"
    DEFAULT_ACCESS_ID = '********'
    DEFAULT_ACCESS_KEY = '********'
    DEFAULT_PROJECT = '***'

    def __init__(self, access_id=None, access_key=None, project=None, endpoint=None):
        """Store connection settings; any argument left as None uses the default."""
        self.access_id = self.DEFAULT_ACCESS_ID if access_id is None else access_id
        self.access_key = self.DEFAULT_ACCESS_KEY if access_key is None else access_key
        self.project = self.DEFAULT_PROJECT if project is None else project
        self.endpoint = self.DEFAULT_ENDPOINT if endpoint is None else endpoint

    def getOdpsConnect(self):
        """Create and return an ODPS entry object from the stored settings."""
        odps = ODPS(access_id=self.access_id,
                    secret_access_key=self.access_key,
                    project=self.project,
                    endpoint=self.endpoint)
        print('odps连接成功')
        return odps

    def setOdpsTableInfos(self, odps, tables):
        """Build the per-sheet export configuration from table metadata.

        :param odps: ODPS entry object used to look up table schemas
        :param tables: iterable of (sheet, table, bizdate) tuples
        :return: dict keyed by '<sheet>(<bizdate>)'; each value holds the
                 table name, the column comments (used as Excel headers)
                 and the partition spec 'dt=<bizdate>'
        """
        table_infos = dict()
        for sheet, table, bizdate in tables:
            cols = odps.get_table(table).schema.columns
            # NOTE(review): Partition is imported from odps.models; confirm this
            # check really filters partition columns out of schema.columns.
            col_comments = [col.comment for col in cols
                            if not isinstance(col, Partition)]
            table_infos['{}({})'.format(sheet, bizdate)] = {
                'table': table,
                'cols': col_comments,
                'partition': 'dt={}'.format(bizdate)
            }
        print('获取配置...', table_infos.keys())
        return table_infos

    def getOdpsData(self, odps, table, partition, cols):
        """Read one partition into a pandas DataFrame and rename its columns.

        :param odps: ODPS entry object
        :param table: table name
        :param partition: partition spec, e.g. 'dt=20220505'
        :param cols: column labels assigned to the resulting DataFrame
        :return: the DataFrame with ``cols`` as its column labels
        """
        df = odps.get_table(table).get_partition(partition).to_df().to_pandas()
        df.columns = cols
        print('获取{}表信息---分区为{}'.format(table, partition))
        return df

    def runOdps(self, tables):
        """Connect, resolve the table configuration and fetch every table.

        :param tables: iterable of (sheet, table, bizdate) tuples
        :return: dict mapping '<sheet>(<bizdate>)' to its DataFrame
        """
        odps = self.getOdpsConnect()
        table_infos = self.setOdpsTableInfos(odps, tables)
        return {
            name: self.getOdpsData(odps, info['table'], info['partition'], info['cols'])
            for name, info in table_infos.items()
        }
mysql模块实现:
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/7/20 20:21
@Project: mmb
@File: Db_mysql.py
@Author: rk
@Software: pycharm
@Desc:
'''
import pandas as pd
import pymysql
class Db_Mysql():
    """Export MySQL tables as pandas DataFrames labelled with column comments."""

    def getMysqlConnect(self):
        """Open a MySQL connection and return ``(cursor, connect)``."""
        host = '********'
        port = 3306
        user = '******'
        password = '******'
        database = '***'
        connect = pymysql.connect(host=host,
                                  port=port,
                                  user=user,
                                  password=password,
                                  database=database,
                                  charset='utf8')
        cursor = connect.cursor()
        return cursor, connect

    def closeMysql(self, connect, cursor):
        """Close the cursor and then the connection."""
        cursor.close()
        connect.close()

    def setMysqlTableInfos(self, cursor, tables):
        """Build the per-sheet export configuration from information_schema.

        :param cursor: open DB-API cursor
        :param tables: iterable of (sheet, table, bizdate) tuples; ``table``
                       is 'schema.table', or a bare table name resolved
                       against the connection's current database
        :return: dict keyed by '<sheet>(<bizdate>)' holding the table name
                 and its column comments (used as Excel headers)
        """
        # Values are bound as parameters instead of being formatted into the
        # SQL text. ORDER BY ordinal_position keeps the comments aligned with
        # the column order of ``select *``.
        cols_sql = '''
            select `table_name`, `column_name`, column_comment
            from information_schema.columns
            where table_schema = coalesce(%s, database()) and table_name = %s
            order by ordinal_position;
        '''
        table_infos = dict()
        for sheet, table, bizdate in tables:
            schema, _, table_name = table.rpartition('.')
            # An empty schema becomes NULL so coalesce() falls back to database()
            cursor.execute(cols_sql, (schema or None, table_name))
            col_comments = [row[2] for row in cursor.fetchall()]
            table_infos['{}({})'.format(sheet, bizdate)] = {
                'table': table,
                'cols': col_comments
            }
        print('获取配置...', table_infos.keys())
        return table_infos

    def getMysqlData(self, cursor, table, cols):
        """Fetch every row of ``table`` into a DataFrame labelled with ``cols``.

        The table name is interpolated into the statement because identifiers
        cannot be bound as parameters; it must come from trusted configuration.
        """
        sql = 'select * from {};'.format(table)
        cursor.execute(sql)
        data = cursor.fetchall()
        # list() so an empty result still yields an empty frame with headers
        return pd.DataFrame(list(data), columns=cols)

    def runMysql(self, tables):
        """Connect, fetch every configured table, and always close the connection.

        :param tables: iterable of (sheet, table, bizdate) tuples
        :return: dict mapping '<sheet>(<bizdate>)' to its DataFrame
        """
        cursor, connect = self.getMysqlConnect()
        try:
            table_infos = self.setMysqlTableInfos(cursor, tables)
            df_maps = dict()
            for name, table_info in table_infos.items():
                df_maps[name] = self.getMysqlData(cursor, table_info['table'], table_info['cols'])
            return df_maps
        finally:
            # The original never released the connection
            self.closeMysql(connect, cursor)
sqlserver模块实现:
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/7/20 20:26
@Project: mmb
@File: Db_Sqlserver.py
@Author: rk
@Software: pycharm
@Desc:
'''
import pandas as pd
import pymssql
class Db_SqlServer():
    """Export SQL Server tables as pandas DataFrames labelled with column comments."""

    def getSqlserverConnect(self):
        """Open a SQL Server connection and return ``(cursor, connect)``."""
        host = '********'
        port = '****'
        user = '******'
        password = '******'
        database = '***'
        connect = pymssql.connect(host=host,
                                  port=port,
                                  user=user,
                                  password=password,
                                  database=database,
                                  charset='utf8')
        cursor = connect.cursor()
        return cursor, connect

    def closeSqlserver(self, connect, cursor):
        """Close the cursor and then the connection."""
        cursor.close()
        connect.close()

    def setSqlserverTableInfos(self, cursor, tables):
        """Build the per-sheet export configuration from the system catalog.

        :param cursor: open DB-API cursor
        :param tables: iterable of (sheet, table, bizdate) tuples; only the
                       last dot-separated part of ``table`` is used for the
                       metadata lookup
        :return: dict keyed by '<sheet>(<bizdate>)' holding the table name
                 and its column comments (used as Excel headers)
        """
        # The table name is bound as a parameter instead of being formatted
        # into the SQL text. ORDER BY colid keeps the comments aligned with
        # the column order of ``select *``.
        # NOTE(review): the join takes every extended property of a column;
        # if columns carry properties other than the description, consider
        # filtering on the property name - confirm against the real schema.
        cols_sql = '''
            select a.name, b.name, c.value
            from dbo.sysobjects a
            left join dbo.syscolumns b
            on a.id = b.id
            left join sys.extended_properties c
            on a.id = c.major_id AND b.colid = c.minor_id
            where a.name = %s
            order by b.colid
        '''
        table_infos = dict()
        for sheet, table, bizdate in tables:
            # [-1] handles 'table', 'schema.table' and 'db.schema.table' alike
            cursor.execute(cols_sql, (table.split('.')[-1],))
            col_comments = [row[2] for row in cursor.fetchall()]
            table_infos['{}({})'.format(sheet, bizdate)] = {
                'table': table,
                'cols': col_comments
            }
        print('获取配置...', table_infos.keys())
        return table_infos

    def getSqlserverData(self, cursor, table, cols):
        """Fetch every row of ``table`` into a DataFrame labelled with ``cols``.

        The table name is interpolated into the statement because identifiers
        cannot be bound as parameters; it must come from trusted configuration.
        """
        sql = 'select * from {};'.format(table)
        cursor.execute(sql)
        data = cursor.fetchall()
        # list() so an empty result still yields an empty frame with headers
        return pd.DataFrame(list(data), columns=cols)

    def runSqlserver(self,tables):
        """Connect, fetch every configured table, and always close the connection.

        :param tables: iterable of (sheet, table, bizdate) tuples
        :return: dict mapping '<sheet>(<bizdate>)' to its DataFrame
        """
        cursor, connect = self.getSqlserverConnect()
        try:
            table_infos = self.setSqlserverTableInfos(cursor, tables)
            df_maps = dict()
            for name, table_info in table_infos.items():
                df_maps[name] = self.getSqlserverData(cursor, table_info['table'], table_info['cols'])
            return df_maps
        finally:
            # The original never released the connection
            self.closeSqlserver(connect, cursor)
统一调用模块实现:
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/7/20 20:30
@Project: mmb
@File: Db_GetData.py
@Author: rk
@Software: pycharm
@Desc:
'''
import datetime
import os
import pandas as pd
from Db_Mysql import Db_Mysql
from Db_Odps import Db_Odps
from Db_SqlServer import Db_SqlServer
class Db_GetData():
    """Fetch tables from the configured source, save them to Excel, prune old files."""

    def saveData(self, name, df_maps, header=True):
        """Write each DataFrame in ``df_maps`` to its own sheet of one workbook.

        :param name: output workbook path
        :param df_maps: dict mapping sheet name to DataFrame
        :param header: whether to write column headers; a sheet whose column
                       labels are all null is always written without a header
        :return: absolute path of the written workbook
        """
        path = os.path.abspath(name)
        # The context manager saves the workbook on exit; ExcelWriter.save()
        # no longer exists in pandas 2.x.
        with pd.ExcelWriter(name) as writer:
            for sheet, df in df_maps.items():
                # Decide per sheet so one header-less sheet does not switch
                # headers off for every sheet written after it
                sheet_header = False if df.columns.isnull().all() else header
                df.to_excel(writer, sheet_name=sheet, index=False, header=sheet_header)
        print('数据保存到:{}'.format(path))
        return path

    @staticmethod
    def _splitDatedName(file_name):
        """Return ``(key, date)`` for '<key>_<YYYYMMDD>.<ext>' names, else None."""
        key, sep, tail = file_name.rpartition('_')
        if not sep:
            return None
        try:
            return key, datetime.datetime.strptime(tail.split('.')[0], '%Y%m%d')
        except ValueError:
            return None

    def deleteHistoryFile(self, save_file, day=3):
        """Delete earlier exports of the same report older than ``day`` days.

        Age is measured against the date in ``save_file``'s own name, not the
        wall clock. The directory of ``save_file`` is walked recursively.

        :param save_file: newest export, named '<key>_<YYYYMMDD>.<ext>'
        :param day: files more than this many days older are removed
        :raises ValueError: if ``save_file`` does not follow the naming pattern
        """
        path, file_name = os.path.split(os.path.abspath(save_file))
        current = self._splitDatedName(file_name)
        if current is None:
            raise ValueError(
                'save_file must be named <key>_<YYYYMMDD>.<ext>: {}'.format(file_name))
        file_key, file_dt = current
        for file_path, _, files in os.walk(path):
            for file in files:
                parsed = self._splitDatedName(file)
                # Skip files without a date suffix (the original crashed on
                # them) and files of other reports (the original matched the
                # key as a substring)
                if parsed is None or parsed[0] != file_key:
                    continue
                if (file_dt - parsed[1]).days > day:
                    print('{}文件大于{}天删除'.format(file, day))
                    os.remove(os.path.join(file_path, file))

    def run(self, save_path, tables, type='odps'):
        """Fetch ``tables`` from the chosen source, save them, prune old exports.

        :param save_path: output workbook path
        :param tables: iterable of (sheet, table, bizdate) tuples
        :param type: one of 'odps', 'sqlserver', 'mysql'
        :raises ValueError: if ``type`` is not a supported source
        """
        if type == 'odps':
            df_maps = Db_Odps().runOdps(tables)
        elif type == 'sqlserver':
            df_maps = Db_SqlServer().runSqlserver(tables)
        elif type == 'mysql':
            df_maps = Db_Mysql().runMysql(tables)
        else:
            # Previously an unknown type fell through with no data
            raise ValueError('unsupported type: {!r}'.format(type))
        save_path = self.saveData(save_path, df_maps)
        # Uses deleteHistoryFile's default retention of 3 days
        self.deleteHistoryFile(save_path)
邮箱正文功能模块实现:
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/4/29 15:33
@Project: pythonProject
@File: EmailContent.py
@Author: rk
@Software: pycharm
@Desc: 邮箱正文 添加内容/附件
'''
from email.mime.multipart import MIMEMultipart
from email.header import Header
from email.mime.text import MIMEText
from email.mime.image import MIMEImage
import os
class EmailContent:
    """Build a multipart e-mail: headers, body parts and attachments."""

    def __init__(self, header, emailSubject, toReceivers, ccReceivers):
        """Create the message and set its From/Subject/To/Cc headers.

        :param header: text placed in the From header
        :param emailSubject: subject line
        :param toReceivers: list of recipient addresses
        :param ccReceivers: list of carbon-copy addresses (may be empty)
        """
        self.msg = MIMEMultipart()
        # NOTE(review): From holds only this text, no sender address; some
        # servers may reject that - confirm against the SMTP provider.
        self.msg['From'] = Header(header, 'utf-8')
        self.msg['Subject'] = Header(emailSubject, "utf-8")
        # Address lists are comma-separated in mail headers
        self.msg['To'] = ",".join(toReceivers)
        # Skip Cc entirely rather than emitting an empty header
        if ccReceivers:
            self.msg["Cc"] = ",".join(ccReceivers)

    def addBody(self, info, bodyType='string', img_path=None):
        """Attach a body part to the message.

        :param info: body text; HTML markup for the 'html' and image types
        :param bodyType: 'string' for plain text, 'html' for HTML, or any
                         value containing 'image' for HTML plus an inline
                         image referenced by Content-ID
        :param img_path: image file path, required for the image type; the
                         file name becomes the Content-ID
        :return: None; an unsupported type or a missing ``img_path`` prints a
                 message and attaches nothing
        """
        print('邮件类型:',bodyType)
        if bodyType == "string":
            mimeText = MIMEText(info, "plain", "utf-8")
        elif bodyType == "html":
            mimeText = MIMEText(info, "html", "utf-8")
        elif "image" in bodyType:
            if not img_path:
                print('请输入图片路径img_path')
                return
            mimeText = MIMEText(info, "html", "utf-8")
            _, imgName = os.path.split(img_path)
            print('imgpath: ',img_path)
            with open(img_path, "rb") as fp:
                mimeImage = MIMEImage(fp.read())
            # Content-ID values are wrapped in angle brackets so that
            # 'cid:<name>' references in the HTML resolve
            mimeImage.add_header("Content-ID", "<{}>".format(imgName))
            self.msg.attach(mimeImage)
        else:
            print('指定类型错误,目前只支持string/html/image')
            return
        self.msg.attach(mimeText)

    def addAttachment(self,attachment_path):
        """Attach the file at ``attachment_path`` as application/octet-stream.

        :param attachment_path: path of the file to attach
        :return: None
        """
        # Imported here so this method does not depend on the file-level imports
        from email.mime.application import MIMEApplication

        _, fileName = os.path.split(attachment_path)
        print("追加附件: ", fileName)
        # MIMEApplication base64-encodes the bytes and sets one Content-Type;
        # the previous MIMEText-based part ended up with two.
        with open(attachment_path, 'rb') as fp:
            enclosure = MIMEApplication(fp.read(), 'octet-stream')
        enclosure.add_header("Content-Disposition", "attachment", filename=("gbk", "", fileName))
        self.msg.attach(enclosure)
邮箱发送功能实现
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/7/21 19:10
@Project: mmb
@File: SendEmail.py
@Author: rk
@Software: pycharm
@Desc: 邮箱发送功能
'''
import smtplib
from email_utils.EmailContent import EmailContent
class SendEmail:
    """Log in to an SMTP server and send one message built with EmailContent."""

    def __init__(self,smtpHost,port,senduser,password,buglevel=False):
        '''
        Connect to the SMTP server and log in.

        :param smtpHost: SMTP server host
        :param port: 25 connects with smtplib.SMTP (no TLS);
                     465 connects with smtplib.SMTP_SSL (TLS)
        :param senduser: sender address, also used as the login name
        :param password: login password
        :param buglevel: value passed to smtplib's set_debuglevel
        :raises ValueError: if ``port`` is neither 25 nor 465
        '''
        # Set up front so send()/setContent() can detect a missing setSubject()
        self.emailContent = None
        self.toAddrs = []
        if port == 25:
            self.smtpServer = smtplib.SMTP(smtpHost,port)
        elif port == 465:
            self.smtpServer = smtplib.SMTP_SSL(smtpHost,port)
        else:
            print('暂不支持此端口,请调整')
            # Previously execution continued and failed with AttributeError
            # on the missing smtpServer attribute
            raise ValueError('unsupported SMTP port: {!r}'.format(port))
        self.smtpServer.set_debuglevel(buglevel)
        self.senderAdr = senduser
        self.smtpServer.login(senduser, password)
        print('发送用户({})登录成功'.format(senduser))

    def setSubject(self,toReceivers,ccReceivers,header,emailSubject):
        """Record the recipients and create the message with its headers.

        :param toReceivers: list of recipient addresses
        :param ccReceivers: list of carbon-copy addresses
        :param header: text for the From header
        :param emailSubject: subject line
        """
        # The envelope recipients are To plus Cc
        self.toAddrs = toReceivers + ccReceivers
        self.emailContent = EmailContent(header, emailSubject, toReceivers, ccReceivers)

    def setContent(self, info, bodyType='string', img_path=None, attachment_file=None):
        """Add the body and, optionally, one attachment; needs setSubject() first.

        :param info: body text
        :param bodyType: body type forwarded to EmailContent.addBody
        :param img_path: inline image path forwarded to EmailContent.addBody
        :param attachment_file: path of a file to attach, if any
        """
        content = getattr(self, 'emailContent', None)
        if not content:
            print('请先配置接收信息(setSubject&setContent)')
            return
        content.addBody(info, bodyType, img_path)
        if attachment_file:
            content.addAttachment(attachment_file)

    def send(self):
        """Send the prepared message, then end the SMTP session.

        Prints a hint and returns without sending when setSubject() has not
        been called.
        """
        # getattr also covers instances created without running __init__
        content = getattr(self, 'emailContent', None)
        if not content:
            print('请先配置接收信息(setSubject&setContent)')
            return
        message = content.msg
        self.smtpServer.sendmail(self.senderAdr,self.toAddrs,message.as_string())
        print('已发送成功')
        # The session is closed after a successful send, so each instance
        # sends once
        self.smtpServer.quit()
#!/usr/bin/env python
# encoding: utf-8
'''
@Time: 2022/5/5 9:35
@Project: pythonProject
@File: email_main.py
@Author: rk
@Software: pycharm
@Desc: 邮箱发送实现
'''
import datetime
from email_send.Db_GetData import Db_GetData
from email_utils.SendEmail import SendEmail
def send_email():
    """Log in to the SMTP server and return the logged-in SendEmail object."""
    # Server and account settings; the original note mentions 465 for linux
    sender = SendEmail(
        "smtp.exmail.qq.com",
        25,
        "***@******.com",
        "******",
        buglevel=False,
    )
    print('邮箱登录成功')
    return sender
def formatParam(save_path,bizdate=None):
    """Resolve the business date and append it to the output file name.

    :param save_path: output path, e.g. '统计表.xlsx'
    :param bizdate: date string 'YYYYMMDD'; defaults to yesterday
    :return: ``(bizdate, save_path)``, with the date inserted before the
             extension ('统计表.xlsx' -> '统计表_20220505.xlsx') unless the
             path already contains it
    """
    # Local import: the surrounding script only imports datetime
    import os

    if not bizdate:
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        bizdate = yesterday.strftime('%Y%m%d')
    if bizdate not in save_path:
        # splitext only splits the final extension, so directories such as
        # './out' and names without an extension are handled correctly
        root, ext = os.path.splitext(save_path)
        save_path = '{}_{}{}'.format(root, bizdate, ext)
    return bizdate, save_path
if __name__== '__main__':
    # bizdate defaults to yesterday; the date is appended to the file name,
    # e.g. 统计表.xlsx -> 统计表_20220505.xlsx
    bizdate, save_path = formatParam('统计表.xlsx')

    # The day before bizdate, used as the partition date of the first table
    previous_day = datetime.datetime.strptime(bizdate, "%Y%m%d") - datetime.timedelta(days=1)
    bizdate_1 = previous_day.strftime('%Y%m%d')

    # (sheet name, table name, partition date)
    tables = [
        ('sheet1', 'tabel1', bizdate_1),
        ('sheet2', 'table2', bizdate),
    ]

    # Query the tables and write the workbook
    Db_GetData().run(save_path, tables, type='odps')

    # Log in, set recipients and subject, attach the workbook, send
    sender = send_email()
    sender.setSubject(["******@163.com"], [], '每日数据统计', "邮箱主题")
    sender.setContent('{}的统计数据'.format(bizdate), attachment_file=save_path)
    sender.send()
随着时代的发展,任务自动化开发后面会越来越多。该工具支持多源配置,更像是一个模板,只需要调整少量参数即可以满足不同的需求,简化了后期类似需求的开发,保证了需求方的高效响应。以上开发均为自己琢磨开发,大家有什么好的建议意见,欢迎大家提出。后面我会再进行改进优化,为大家提供更好的工具。
具体代码可参考:git仓库-email_send