邮件类型的文件解析

需求:这是2017年大概12月份去省厅出差时遇到的,当时有一大批邮件类型的文件,有人想把它的内容解析出来然后存到数据库里面进行查询。当时的难点就是把邮件内容分离出来存到数据库
解决:这个不算难,只是编码问题稍微麻烦点,重点是客户的数据不能带回家看

# -*- encoding: utf-8 -*-
import email
import email.errors
import email.header
import email.utils
import multiprocessing
import os
import re
import time

import pymysql


"""
    FoxMail文件处理
"""


class FoxMailPrase(object):
    """Parse one e-mail (.eml) file object into a flat dict of text fields.

    The file object may be opened in text or in binary mode; it is read but
    not closed here (closing stays the caller's responsibility).
    """

    # Codecs tried, in order, when the declared charset is missing, unknown
    # to Python, or simply wrong for the bytes at hand.
    _FALLBACK_CHARSETS = ("utf-8", "gb18030")

    # Dotted-quad IPv4 address with every octet limited to 0-255.
    _IP_PATTERN = re.compile(
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
        r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
    )

    # Line break (plus surrounding blanks) left over from header folding.
    _FOLDING = re.compile(r"[ \t]*[\r\n]+[ \t]*")

    def __init__(self, fp):
        """Remember the open file object *fp* that ``prase`` will read."""
        self.fp = fp

    def prase(self):
        """Read the message from ``self.fp`` and return its fields as a dict.

        Returns:
            dict with string values under the keys ``subject``, ``from``,
            ``cc``, ``to``, ``add_time``, ``received_ip``,
            ``originating_ip``, ``files`` and ``content``.  Absent headers
            give ``""``, except a missing Subject which gives the literal
            string ``"None"`` (``prase_mail`` in this file tests for exactly
            that value to detect a message whose headers were not parsed).

        Raises:
            Whatever reading/parsing ``self.fp`` or the HTML filter raises.
        """
        msg = self._load_message()

        d = dict()

        # Subject: keep the historical "None" marker for a missing header.
        subject = msg.get("Subject")
        if subject is None:
            d['subject'] = "None"
        else:
            d['subject'] = self._decode_header(subject)

        # Sender, carbon copy and recipients.
        d['from'] = self._decode_header(msg.get("From"))
        d['cc'] = self._decode_header(msg.get("Cc"))
        d['to'] = self._decode_header(msg.get("To"))

        # The Date header is stored verbatim (no date parsing is attempted).
        date = msg.get("Date")
        d['add_time'] = "" if date is None else str(date)

        # IP addresses mentioned by the relay / origin headers.
        d['received_ip'] = self._header_ips(msg, "Received")
        d['originating_ip'] = self._header_ips(msg, "X-Originating-IP")

        # Attachment names and the body text.
        files, content = self._collect_parts(msg)
        d['files'] = files
        d['content'] = HtmlDeal().filter_tags(content)

        return d

    def _load_message(self):
        """Build the email message, using the parser that matches the file mode."""
        # read(0) consumes nothing but reveals whether the stream yields bytes
        # or str; message_from_file only accepts text streams.
        if isinstance(self.fp.read(0), bytes):
            return email.message_from_binary_file(self.fp)
        return email.message_from_file(self.fp)

    @classmethod
    def _to_text(cls, raw, charset=None):
        """Turn one header/body chunk into str without ever raising on bad bytes."""
        if isinstance(raw, str):
            return raw
        if not isinstance(raw, (bytes, bytearray)):
            return str(raw)

        candidates = []
        if charset:
            declared = str(charset)
            candidates.append(declared)
            # Mail declared as gb2312/gbk frequently contains characters from
            # the larger GB family, so retry with the superset codec first.
            if declared[:2].lower() == "gb":
                candidates.append("gb18030")
        candidates.extend(cls._FALLBACK_CHARSETS)

        for codec in candidates:
            try:
                return raw.decode(codec)
            except (LookupError, ValueError):
                # Unknown codec name or bytes invalid for it: try the next one.
                continue
        # Last resort: keep what is readable and mark the rest.
        return raw.decode("utf-8", "replace")

    @classmethod
    def _decode_header(cls, value):
        """Decode an RFC 2047 header value to text; ``None`` becomes ``""``."""
        if value is None:
            return ""
        try:
            chunks = email.header.decode_header(value)
        except (email.errors.HeaderParseError, TypeError, ValueError):
            # Malformed encoded-word: fall back to the raw header text.
            return str(value)
        # The chunks are consecutive pieces of ONE header value, so they are
        # concatenated as-is; inserting a separator would corrupt the value.
        text = "".join(cls._to_text(raw, charset) for raw, charset in chunks)
        return cls._FOLDING.sub(" ", text).strip()

    @classmethod
    def _decode_param(cls, value):
        """Decode a Content-Type parameter (plain, RFC 2047 or RFC 2231 form)."""
        if isinstance(value, tuple):
            # get_param returns a (charset, language, value) tuple for
            # RFC 2231 encoded parameters.
            return email.utils.collapse_rfc2231_value(value)
        return cls._decode_header(value)

    @classmethod
    def _header_ips(cls, msg, header):
        """Return the distinct IPs found in every *header* field, in order.

        One address (the first) is taken per header occurrence, and the whole
        folded value is searched rather than only its first physical line.
        """
        found = []
        for value in msg.get_all(header, []):
            match = cls._IP_PATTERN.search(str(value))
            if match and match.group(0) not in found:
                found.append(match.group(0))
        return ", ".join(found)

    @classmethod
    def _collect_parts(cls, msg):
        """Walk the MIME tree and return ``(attachment_names, body_text)``.

        A leaf part whose Content-Type carries a ``name`` parameter counts as
        an attachment; the first leaf part without one supplies the body.
        """
        names = []
        body = None  # (payload bytes, declared charset) of the body part
        try:
            for part in msg.walk():
                # Multipart containers hold no content of their own.
                if part.is_multipart():
                    continue
                name = part.get_param("name")
                if name:
                    names.append(cls._decode_param(name))
                elif body is None:
                    body = (part.get_payload(decode=True),
                            part.get_content_charset())
        except Exception:
            # Deliberate best effort: a structurally broken MIME tree yields
            # no attachments and no body instead of failing the whole mail.
            names = []
            body = None

        files = ", ".join(names)
        if body and body[0]:
            content = cls._to_text(body[0], body[1])
        else:
            content = ""
        return files, content


"""
文件目录遍历
"""


class FileList(object):
    """Directory helper that locates e-mail (.eml) files."""

    def getListFiles(self, path):
        """Return the full paths of all ``.eml`` files below *path*.

        The tree is walked recursively with ``os.walk``.  The extension is
        compared case-insensitively so that files named ``*.EML`` are not
        silently skipped.  A missing directory yields an empty list.
        """
        matches = []
        for root, _dirs, names in os.walk(path):
            for name in names:
                if name.lower().endswith(".eml"):
                    matches.append(os.path.join(root, name))
        return matches


'''
数据存储处理
'''


class MysqlConn(object):
    """Minimal MySQL access layer for the ``email_list`` table.

    Every public method opens its own connection and closes it again before
    returning.
    """

    def __init__(self, host, user, pwd, db):
        """Store the connection settings; no connection is opened here."""
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db

    def _connect(self):
        """Open a new connection with the stored settings."""
        return pymysql.connect(host=self.host,
                               user=self.user,
                               password=self.pwd,
                               db=self.db,
                               charset='utf8',
                               cursorclass=pymysql.cursors.DictCursor)

    def execute(self, from_user, cc_user, to_user, subject, content, add_time, files, received_ip, originating_ip, url,):
        """Insert one row into ``email_list`` and commit it.

        NOTE: the values are written in positional order into the columns
        that follow ``id``.  With the table layout created by
        ``create_table`` that means the first argument lands in ``subject``,
        the second in ``from``, the third in ``cc`` and the fourth in ``to``
        -- the parameter names do not match those columns.  ``insert_table``
        in this file passes its values in column order, so the names are
        kept unchanged for compatibility.

        Raises:
            Any pymysql error raised while connecting or inserting.
        """
        # Placeholders let the driver quote every value, including the two IP
        # fields that used to be concatenated into the statement unescaped.
        sql = ("INSERT INTO email_list VALUES "
               "(0, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);")
        params = (from_user, cc_user, to_user, subject, content,
                  add_time, files, received_ip, originating_ip, url)

        connection = self._connect()
        try:
            with connection.cursor() as cursor:
                # Keep echoing the statement that is about to be executed.
                print(cursor.mogrify(sql, params))
                cursor.execute(sql, params)

            # The connection is not autocommit by default, so commit to save.
            connection.commit()
        finally:
            connection.close()

    def create_table(self):
        """Create the ``email_list`` table if it does not exist yet."""
        sql = "CREATE TABLE IF NOT EXISTS `email_list` (\
                          `id` int(11) NOT NULL AUTO_INCREMENT,\
                          `subject` varchar(100) DEFAULT NULL,\
                          `from` varchar(100) DEFAULT NULL,\
                          `cc` varchar(5000) DEFAULT NULL,\
                          `to` varchar(5000) DEFAULT NULL,\
                          `content` text,\
                          `sendtime` varchar(50) DEFAULT NULL,\
                          `files` varchar(255) DEFAULT NULL,\
                          `received_ip` varchar(255) DEFAULT NULL,\
                          `originating_ip` varchar(255) DEFAULT NULL,\
                          `url` varchar(255) DEFAULT NULL,\
                          PRIMARY KEY (`id`)\
                        ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;"

        connection = self._connect()
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql)

            # The connection is not autocommit by default, so commit to save.
            connection.commit()
        finally:
            connection.close()


"""
处理html标签及空白行
"""


class HtmlDeal(object):
    """Reduce an HTML fragment to plain text (tags removed, entities replaced)."""

    # NOTE(review): the CDATA, <br>, tag, comment and entity patterns had lost
    # their "<...>" portions (the entity pattern no longer compiled at all);
    # they are reconstructed here from what each one is documented to match.
    _RE_CDATA = re.compile(r'//<!\[CDATA\[[^>]*//\]\]>', re.I)  # CDATA section
    _RE_SCRIPT = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)  # script
    _RE_STYLE = re.compile(r'<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)  # style
    _RE_BR = re.compile(r'<br\s*?/?>')  # line break tag
    _RE_TAG = re.compile(r'</?\w+[^>]*>')  # any other HTML tag
    _RE_COMMENT = re.compile(r'<!--[^>]*-->')  # HTML comment
    _RE_BLANK_LINES = re.compile(r'\n+')
    _RE_SPACED_BLANK_LINES = re.compile(r'\n\s*\n')
    _RE_ENTITY = re.compile(r'&#?(?P<name>\w+);')

    # Supported entities, keyed by name or by decimal code point.  Add more
    # entries here to handle further entities.
    _CHAR_ENTITIES = {'nbsp': ' ', '160': ' ',
                      'lt': '<', '60': '<',
                      'gt': '>', '62': '>',
                      'amp': '&', '38': '&',
                      'quot': '"', '34': '"', }

    def filter_tags(self, htmlstr):
        """Return *htmlstr* with markup removed.

        CDATA sections, script/style elements, tags and comments are dropped,
        ``<br>`` becomes a newline, character entities are replaced and runs
        of blank lines are collapsed to a single newline.
        """
        s = self._RE_CDATA.sub('', htmlstr)  # drop CDATA
        s = self._RE_SCRIPT.sub('', s)  # drop script elements
        s = self._RE_STYLE.sub('', s)  # drop style elements
        s = self._RE_BR.sub('\n', s)  # turn <br> into a newline
        s = self._RE_TAG.sub('', s)  # drop the remaining tags
        s = self._RE_COMMENT.sub('', s)  # drop HTML comments
        s = self._RE_BLANK_LINES.sub('\n', s)  # squeeze repeated newlines
        # Entities are replaced only after tag removal, so a decoded "<" or
        # ">" cannot be mistaken for markup.
        s = self.replaceCharEntity(s)
        return self._RE_SPACED_BLANK_LINES.sub("\n", s)

    def replaceCharEntity(self, htmlstr):
        """Replace common HTML character entities with the plain character.

        Entities missing from the table are replaced by an empty string.
        The text is processed in a single pass, so the output of one
        replacement is never decoded a second time (``&amp;lt;`` becomes
        ``&lt;``, not ``<``).
        """
        table = self._CHAR_ENTITIES

        def _swap(match):
            return table.get(match.group('name'), '')

        return self._RE_ENTITY.sub(_swap, htmlstr)


"""
下面的方法处理主要是为了实现多进程,mian方法里面一般是不定义方法的,
即使语法正确也会出问题
"""


# 读取配置文件
def read_url():
    """Read ``url.txt`` from the current working directory.

    Returns the lines (newline characters removed) up to the first empty
    line or the end of the file, followed by one terminating ``""`` entry.
    """
    config_path = os.path.join(os.getcwd(), "url.txt")
    entries = []
    with open(config_path, "r") as handle:
        for raw in handle:
            text = raw.replace("\n", "")
            entries.append(text)
            if not text:
                # An empty line ends the configuration.
                break
        else:
            # End of file reached without an empty line: add the terminator.
            entries.append("")
    return entries


# 创建数据表
def create_table():
    """Announce the step, wait two seconds, then create the MySQL table.

    The connection settings are entries 1-4 of the list returned by
    ``read_url``.
    """
    print("正在创建MySQL数据库表格......")
    time.sleep(2)
    settings = read_url()
    connection_args = [settings[index] for index in (1, 2, 3, 4)]
    MysqlConn(*connection_args).create_table()


# 插入数据
def insert_table(mail, path2):
    """Store one parsed mail dict plus its source path *path2* in MySQL.

    The connection settings are entries 1-4 of the list returned by
    ``read_url``.
    """
    settings = read_url()
    store = MysqlConn(settings[1], settings[2], settings[3], settings[4])
    # Order in which the mail fields are handed to MysqlConn.execute.
    field_order = ("subject", "from", "cc", "to", "content",
                   "add_time", "files", "received_ip", "originating_ip")
    values = [mail[key] for key in field_order]
    store.execute(*values, path2)


# 文件完整路径集合
def file_list():
    """Create the table, then return the .eml paths under the configured directory.

    The directory is entry 0 of the list returned by ``read_url``.
    """
    # The table is created first so later inserts have a target.
    create_table()
    scan_root = read_url()[0]
    return FileList().getListFiles(scan_root)


# 解析邮件
def _parse_eml(path, **open_kwargs):
    """Open *path* with the given ``open()`` options, parse it and close it."""
    with open(path, **open_kwargs) as fp:
        return FoxMailPrase(fp).prase()


def prase_mail(path1):
    """Parse the mail file *path1* and store it, trying several open modes.

    The attempts are, in order: text mode with the default encoding, binary
    mode, and text mode as UTF-8 with undecodable bytes ignored.  Each
    attempt parses the file and inserts the result; any exception moves on
    to the next attempt.  If every attempt fails, the path is appended to
    ``error_log.txt`` in the current working directory and the last error is
    printed.  The path is always printed at the end.
    """
    # (open() options, whether to retry with BOM stripping)
    attempts = (
        ({"mode": "r"}, False),
        ({"mode": "rb"}, False),
        ({"mode": "r", "encoding": "utf-8", "errors": "ignore"}, True),
    )
    last_error = None
    try:
        for open_kwargs, retry_without_bom in attempts:
            try:
                mail = _parse_eml(path1, **open_kwargs)
                if (retry_without_bom and mail["subject"] == "None"
                        and mail["to"] == "" and mail["from"] == ""):
                    # No headers were recognised: handle a UTF-8 file that
                    # starts with a BOM by parsing it again without the BOM.
                    mail = _parse_eml(path1, mode="r", encoding='utf-8-sig',
                                      errors="ignore")
                insert_table(mail, path1)
                return
            except Exception as e:
                # Deliberate best effort: remember the failure and fall
                # through to the next way of reading the file.
                last_error = e

        with open(os.path.join(os.getcwd(), "error_log.txt"), "a") as f:
            f.write(path1 + "\n")
        print(last_error)
    finally:
        print(path1)

if __name__ == '__main__':
    # Needed so a pyinstaller-frozen multi-process program does not keep
    # spawning new processes.
    multiprocessing.freeze_support()

    # Build the file list exactly once: file_list() also creates the table
    # and walks the whole directory tree, so calling it again just to count
    # the files would repeat all of that work.
    paths = file_list()

    # Parse the files in parallel, one worker process per CPU.
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    for path in paths:
        pool.apply_async(prase_mail, (path, ))
    pool.close()
    pool.join()

    print("总共有 %s 个文件!" % (len(paths)))
"""
下面的是原始的单线程处理方法,单进程处理文件的实现方式
"""

"""
    # 读取配置文件
    def read_url():
        lines = []
        new_path = os.getcwd() + "\\url.txt"
        with open(new_path, "r") as f:
            while 1:
                line = f.readline().replace("\n", "")
                lines.append(line)
                if not line:
                    break
        return lines

    read_url = read_url()
    # print(read_url)

    filelist = FileList()
    files = filelist.getListFiles(read_url[0])

    db = MysqlConn(read_url[1], read_url[2], read_url[3], read_url[4])
    db.create_table()

    total = len(files)
    current_count = 1

    for path in files:
        try:
            fp1 = open(path, "r")
            mailprase1 = FoxMailPrase(fp1)
            mail = mailprase1.prase()
            # print(mailprase1)
            db.execute(mail["subject"], mail["from"], mail["to"], mail["content"],
                       mail["add_time"], mail["files"], mail["received_ip"], mail['originating_ip'], path)
            print("总共 %s 个文件,第 %s 个文件完成解析,路径是:%s" % (total, current_count, path))
            current_count += 1
        except Exception as e:
            try:
                fp2 = open(path, "r", encoding='utf-8')
                mailprase2 = FoxMailPrase(fp2)
                mail = mailprase2.prase()
                # print(mailprase2)
                db.execute(mail["subject"], mail["from"], mail["to"], mail["content"],
                           mail["add_time"], mail["files"], mail["received_ip"], mail['originating_ip'], path)
                print("总共 %s 个文件,第 %s 个文件完成解析,路径是:%s" % (total, current_count, path))
                current_count += 1
            except Exception as e:
                with open(os.getcwd() + "\\error_log.txt", "a") as f:
                    f.write(path + "\n")
                print("总共 %s 个文件,第 %s 个文件解析出现错误,路径是:%s" % (total, current_count, path))
                current_count += 1
                print(e)
"""

配置文件:url.txt
文件说明:(文件路径、IP、用户、密码、数据库名)
D:\share
127.0.0.1
xiang
123456
tree30

你可能感兴趣的:(Python)