需求:这是2017年大概12月份去省厅出差时遇到的,当时有一大批邮件类型的文件,有人想把它的内容解析出来然后存到数据库里面进行查询。当时的难点就是把邮件内容分离出来存到数据库
解决:这个不算难,只是编码问题稍微麻烦点,重点是客户的数据不能带回家看
# -*- encoding: utf-8 -*-
import email
import email.header
import multiprocessing
import os
import re
import time

import pymysql
"""
FoxMail文件处理
"""
class FoxMailPrase(object):
    """Parse one ``.eml`` message into a flat dict of text fields.

    The instance wraps an already-open file object (text or binary mode).
    ``prase()`` -- the historical spelling is kept because callers use it --
    reads the whole stream and returns a dict with the keys ``subject``,
    ``from``, ``cc``, ``to``, ``add_time``, ``received_ip``,
    ``originating_ip``, ``files`` and ``content``. Every value is a ``str``;
    a field that is absent from the message is the empty string.
    """

    # Dotted-quad IPv4 address with every octet limited to 0-255.
    _IPV4_RE = re.compile(
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
        r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
    )
    # Tried, in order, when the declared charset is missing, unknown or wrong.
    # gb18030 is a superset of gbk/gb2312, which mislabelled mail often needs.
    _FALLBACK_CHARSETS = ("utf-8", "gb18030")
    _UTF8_BOM = b"\xef\xbb\xbf"

    def __init__(self, fp):
        """Remember the open file object *fp*; nothing is read yet."""
        self.fp = fp

    def prase(self):
        """Read the message from ``self.fp`` and return its fields as a dict.

        Returns:
            dict: see the class docstring for the keys. ``content`` is the
            first non-attachment part with HTML markup stripped.

        Raises:
            Whatever ``self.fp.read()`` raises (for example
            ``UnicodeDecodeError`` for a text stream opened with the wrong
            encoding); callers use that to retry with another encoding.
        """
        msg = self._load_message()
        attachment_names, body_text = self._collect_parts(msg)
        raw_date = msg.get("Date")

        d = dict()
        d['subject'] = self._decode_header_value(msg.get("Subject"))
        d['from'] = self._decode_header_value(msg.get("From"))
        d['cc'] = self._decode_header_value(msg.get("Cc"))
        d['to'] = self._decode_header_value(msg.get("To"))
        d['add_time'] = "" if raw_date is None else str(raw_date)
        d['received_ip'] = self._header_ips(msg, "Received")
        d['originating_ip'] = self._header_ips(msg, "X-Originating-IP")
        d['files'] = ", ".join(attachment_names)
        d['content'] = HtmlDeal().filter_tags(body_text)
        return d

    def _load_message(self):
        """Build a ``Message`` from ``self.fp``, accepting str or bytes data."""
        data = self.fp.read()
        if isinstance(data, bytes):
            # A leading BOM would otherwise hide the first header line and
            # make the parser treat the entire file as body text.
            if data.startswith(self._UTF8_BOM):
                data = data[len(self._UTF8_BOM):]
            return email.message_from_bytes(data)
        if data.startswith("\ufeff"):
            data = data[1:]
        return email.message_from_string(data)

    @classmethod
    def _decode_bytes(cls, raw, charset=None):
        """Decode *raw* with *charset* first, then the fallback charsets."""
        if isinstance(raw, str):
            return raw
        candidates = [str(charset)] if charset else []
        candidates.extend(cls._FALLBACK_CHARSETS)
        for name in candidates:
            try:
                return raw.decode(name)
            except (LookupError, UnicodeDecodeError):
                continue
        # Nothing fitted: keep what is readable instead of a bytes repr.
        return raw.decode("utf-8", errors="replace")

    @classmethod
    def _decode_header_value(cls, raw):
        """Return the RFC 2047-decoded text of header value *raw* ("" if None)."""
        if raw is None:
            return ""
        try:
            # decode_header() yields the consecutive pieces of ONE header, so
            # they are concatenated as-is; inserting a separator between them
            # would corrupt values such as 'Name <addr>'.
            return "".join(
                cls._decode_bytes(chunk, charset)
                for chunk, charset in email.header.decode_header(raw)
            )
        except Exception:
            # Best effort: a malformed encoded-word must not lose the field.
            return str(raw)

    def _header_ips(self, msg, header_name):
        """Return the first IPv4 of each *header_name* header, de-duplicated.

        The addresses keep their order of appearance and are joined with
        ", "; the result is "" when none is found.
        """
        found = []
        for value in msg.get_all(header_name, []):
            match = self._IPV4_RE.search(str(value))
            if match:
                found.append(match.group(0))
        return ", ".join(dict.fromkeys(found))

    def _collect_parts(self, msg):
        """Walk the MIME tree; return (attachment names, first body text)."""
        names = []
        body_text = None
        for part in msg.walk():
            # Multipart containers carry no content of their own.
            if part.is_multipart():
                continue
            try:
                filename = part.get_filename()
                if filename:
                    names.append(self._decode_header_value(filename))
                elif body_text is None:
                    payload = part.get_payload(decode=True)
                    if payload:
                        body_text = self._decode_bytes(
                            payload, part.get_content_charset())
                    else:
                        body_text = ""
            except Exception:
                # Best effort: one broken part must not discard the others.
                continue
        return names, body_text or ""
"""
文件目录遍历
"""
class FileList(object):
    """Locate ``.eml`` message files underneath a directory."""

    def getListFiles(self, path):
        """Return the full paths of all ``.eml`` files below *path*.

        The tree is walked recursively with ``os.walk``. The extension is
        compared case-insensitively so that files named ``*.EML`` are not
        skipped. A missing directory yields an empty list.
        """
        return [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(path)
            for name in names
            if name.lower().endswith(".eml")
        ]
'''
数据存储处理
'''
class MysqlConn(object):
    """Store parsed mails in the MySQL table ``email_list``.

    Only the connection settings are kept on the instance; every public
    method opens its own connection and closes it again before returning.
    """

    # Placeholders let the driver quote every value, including the two IP
    # fields that used to be concatenated into the statement unescaped.
    _INSERT_SQL = (
        "INSERT INTO email_list "
        "VALUES (0, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
    )
    _CREATE_TABLE_SQL = (
        "CREATE TABLE IF NOT EXISTS `email_list` ("
        "`id` int(11) NOT NULL AUTO_INCREMENT,"
        "`subject` varchar(100) DEFAULT NULL,"
        "`from` varchar(100) DEFAULT NULL,"
        "`cc` varchar(5000) DEFAULT NULL,"
        "`to` varchar(5000) DEFAULT NULL,"
        "`content` text,"
        "`sendtime` varchar(50) DEFAULT NULL,"
        "`files` varchar(255) DEFAULT NULL,"
        "`received_ip` varchar(255) DEFAULT NULL,"
        "`originating_ip` varchar(255) DEFAULT NULL,"
        "`url` varchar(255) DEFAULT NULL,"
        "PRIMARY KEY (`id`)"
        ") ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;"
    )

    def __init__(self, host, user, pwd, db):
        """Remember the server address, credentials and database name."""
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def _connect(self):
        """Open a new connection using the stored settings."""
        return pymysql.connect(host=self.host,
                               user=self.user,
                               password=self.pwd,
                               db=self.db,
                               charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)

    def execute(self, from_user, cc_user, to_user, subject, content, add_time, files, received_ip, originating_ip, url,):
        """Insert one row into ``email_list`` and commit it.

        The ten arguments are written positionally after the id, so with the
        schema from ``create_table`` the first argument lands in ``subject``,
        the second in ``from``, the third in ``cc``, the fourth in ``to`` and
        the fifth in ``content`` -- the parameter names do not match that
        order and are kept only for compatibility with existing callers.
        NOTE(review): this assumes the table was created by ``create_table``.

        Raises:
            pymysql.MySQLError: if connecting or inserting fails.
        """
        values = (from_user, cc_user, to_user, subject, content,
                  add_time, files, received_ip, originating_ip, url)
        connection = self._connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(self._INSERT_SQL, values)
            # The connection is not autocommit by default.
            connection.commit()
        finally:
            connection.close()

    def create_table(self):
        """Create the ``email_list`` table if it does not exist yet.

        Raises:
            pymysql.MySQLError: if connecting or the statement fails.
        """
        connection = self._connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(self._CREATE_TABLE_SQL)
            connection.commit()
        finally:
            connection.close()
"""
处理html标签及空白行
"""
class HtmlDeal(object):
    """Reduce an HTML fragment to plain text.

    Tags, comments, inline scripts and styles are removed, ``<br>`` becomes
    a newline, a few common character entities are replaced and runs of
    blank lines are collapsed.
    """

    _RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)
    _RE_SCRIPT = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
    _RE_STYLE = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    # Case-insensitive so that <BR> also becomes a line break instead of
    # being deleted together with the other tags.
    _RE_BR = re.compile(r'<br\s*?/?>', re.I)
    _RE_TAG = re.compile(r'</?\w+[^>]*>')
    _RE_COMMENT = re.compile(r'<!--[^>]*-->')
    _RE_NEWLINES = re.compile(r'\n+')
    _RE_BLANK_LINES = re.compile(r'\n\s*\n')
    # Named (&lt;) or numeric (&#60;) entity; the part between '&'/'&#' and
    # ';' is captured as "name".
    _RE_ENTITY = re.compile(r'&#?(?P<name>\w+);')
    # Extend this table to support more entities.
    _CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                      'lt': '<', '60': '<',
                      'gt': '>', '62': '>',
                      'amp': '&', '38': '&',
                      'quot': '"', '34': '"', }

    def filter_tags(self, htmlstr):
        """Return *htmlstr* with the HTML markup removed.

        Args:
            htmlstr: the HTML text to clean.

        Returns:
            str: the remaining text, without empty lines.
        """
        s = self._RE_CDATA.sub('', htmlstr)   # drop CDATA sections
        s = self._RE_SCRIPT.sub('', s)        # drop <script> elements
        s = self._RE_STYLE.sub('', s)         # drop <style> elements
        s = self._RE_BR.sub('\n', s)          # <br> -> newline
        s = self._RE_TAG.sub('', s)           # drop the remaining tags
        s = self._RE_COMMENT.sub('', s)       # drop comments
        s = self._RE_NEWLINES.sub('\n', s)    # collapse repeated newlines
        s = self.replaceCharEntity(s)
        # Entity replacement can leave whitespace-only lines; drop them too.
        s = self._RE_BLANK_LINES.sub('\n', s)
        return s

    def replaceCharEntity(self, htmlstr):
        """Replace the known character entities in *htmlstr*.

        Entities missing from the table are removed. The text is rewritten
        in one pass, so the output of one replacement is never decoded again
        (``&amp;lt;`` yields ``&lt;``, not ``<``).

        Args:
            htmlstr: text that may contain entities such as ``&gt;``.

        Returns:
            str: the text with the entities replaced.
        """
        table = self._CHAR_ENTITIES
        return self._RE_ENTITY.sub(
            lambda found: table.get(found.group('name'), ''), htmlstr)
"""
下面的方法处理主要是为了实现多进程,main方法里面一般是不定义方法的,
即使语法正确也会出问题
"""
# 读取配置文件
def read_url():
    """Load the settings stored in ``url.txt`` in the working directory.

    Returns:
        list: the lines of the file without their newline, in file order,
        up to the first empty line or the end of the file, followed by one
        empty string marking that stop.
    """
    config_path = os.path.join(os.getcwd(), "url.txt")
    entries = []
    with open(config_path, "r") as handle:
        for raw in handle:
            text = raw.replace("\n", "")
            if not text:
                # An empty line ends the settings.
                break
            entries.append(text)
    # The terminating empty entry is part of the returned list.
    entries.append("")
    return entries
# 创建数据表
def create_table():
    """Announce the step, wait two seconds, then create the mail table.

    The connection settings are entries 1-4 of the list returned by
    ``read_url()`` and are passed to ``MysqlConn`` in that order.
    """
    print("正在创建MySQL数据库表格......")
    time.sleep(2)
    settings = read_url()
    host, user, password, database = (
        settings[1], settings[2], settings[3], settings[4])
    MysqlConn(host, user, password, database).create_table()
# 插入数据
def insert_table(mail, path2):
    """Store one parsed mail as a database row.

    Args:
        mail: dict produced by the mail parser; the keys listed below are
            read from it.
        path2: path of the source file, passed as the last value.
    """
    settings = read_url()
    store = MysqlConn(settings[1], settings[2], settings[3], settings[4])
    # Order in which the mail fields are handed to MysqlConn.execute().
    field_order = ("subject", "from", "cc", "to", "content",
                   "add_time", "files", "received_ip", "originating_ip")
    store.execute(*[mail[key] for key in field_order], path2)
# 文件完整路径集合
def file_list():
    """Create the database table, then return the paths of all mail files.

    The directory that is searched is the first entry returned by
    ``read_url()``.
    """
    # Make sure the target table exists before any mail is processed.
    create_table()
    source_dir = read_url()[0]
    return FileList().getListFiles(source_dir)
# 解析邮件
def _parse_eml(path, mode, **open_kwargs):
    """Parse *path* with ``FoxMailPrase`` and close the file afterwards."""
    with open(path, mode, **open_kwargs) as fp:
        return FoxMailPrase(fp).prase()


def prase_mail(path1):
    """Parse the mail file *path1* and store it in the database.

    The file is opened in up to three ways, in this order: text mode with
    the platform default encoding, binary mode, and UTF-8 text ignoring
    undecodable bytes. The first attempt whose parse and insert both succeed
    wins. When all of them fail, the path is appended to ``error_log.txt``
    in the working directory and the last error is printed. The path is
    printed in every case once processing has finished.
    """
    attempts = (
        {"mode": "r"},
        {"mode": "rb"},
        {"mode": "r", "encoding": "utf-8", "errors": "ignore"},
    )
    try:
        error = None
        for options in attempts:
            try:
                mail = _parse_eml(path1, **options)
                # A UTF-8 file that starts with a BOM parses without any
                # headers (a missing subject shows up as "None" or ""), so
                # read it again with the BOM-aware codec.
                if (options.get("encoding") == "utf-8"
                        and mail["subject"] in ("", "None")
                        and mail["to"] == "" and mail["from"] == ""):
                    mail = _parse_eml(path1, "r", encoding="utf-8-sig",
                                      errors="ignore")
                insert_table(mail, path1)
                return
            except Exception as exc:
                # Any failure, parse or insert, moves on to the next attempt.
                error = exc
        with open(os.path.join(os.getcwd(), "error_log.txt"), "a") as f:
            f.write(path1 + "\n")
        print(error)
    finally:
        print(path1)
if __name__ == '__main__':
    # Needed for frozen (PyInstaller) builds, which would otherwise keep
    # spawning new processes.
    multiprocessing.freeze_support()
    # file_list() also creates the table and pauses before scanning, so it
    # is called once and its result reused for the final count.
    paths = file_list()
    # One worker per CPU; each task parses and stores a single file.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    for path in paths:
        pool.apply_async(prase_mail, (path, ))
    pool.close()
    pool.join()
    print("总共有 %s 个文件!" % (len(paths)))
"""
下面的是原始的单线程处理方法,单进程处理文件的实现方式
"""
"""
# 读取配置文件
def read_url():
lines = []
new_path = os.getcwd() + "\\url.txt"
with open(new_path, "r") as f:
while 1:
line = f.readline().replace("\n", "")
lines.append(line)
if not line:
break
return lines
read_url = read_url()
# print(read_url)
filelist = FileList()
files = filelist.getListFiles(read_url[0])
db = MysqlConn(read_url[1], read_url[2], read_url[3], read_url[4])
db.create_table()
total = len(files)
current_count = 1
for path in files:
try:
fp1 = open(path, "r")
mailprase1 = FoxMailPrase(fp1)
mail = mailprase1.prase()
# print(mailprase1)
db.execute(mail["subject"], mail["from"], mail["to"], mail["content"],
mail["add_time"], mail["files"], mail["received_ip"], mail['originating_ip'], path)
print("总共 %s 个文件,第 %s 个文件完成解析,路径是:%s" % (total, current_count, path))
current_count += 1
except Exception as e:
try:
fp2 = open(path, "r", encoding='utf-8')
mailprase2 = FoxMailPrase(fp2)
mail = mailprase2.prase()
# print(mailprase2)
db.execute(mail["subject"], mail["from"], mail["to"], mail["content"],
mail["add_time"], mail["files"], mail["received_ip"], mail['originating_ip'], path)
print("总共 %s 个文件,第 %s 个文件完成解析,路径是:%s" % (total, current_count, path))
current_count += 1
except Exception as e:
with open(os.getcwd() + "\\error_log.txt", "a") as f:
f.write(path + "\n")
print("总共 %s 个文件,第 %s 个文件解析出现错误,路径是:%s" % (total, current_count, path))
current_count += 1
print(e)
"""
配置文件:url.txt(放在程序运行时的当前工作目录下)
文件说明:每行一项,依次为(邮件文件所在目录、数据库IP、用户、密码、数据库名),示例如下:
D:\share
127.0.0.1
xiang
123456
tree30