Python定时任务 获取邮件附件PDF 解析成PNG 保存到本地

一、开发环境

win10操作系统
MySQL数据库
python3.7
PyMySQL==0.9.2
PyPDF2==1.26.0
PythonMagick==0.9.19
Wand==0.4.4
gs924w64(Ghostscript 9.24 的 64 位 Windows 安装包,Wand 解析 PDF 时需要)
上述环境所需的安装包可通过以下链接下载:https://pan.baidu.com/s/10izLvFOD0IafTrmQDf-x1A


二、实现目标

  • 定时获取邮箱中邮件的标题以及附件pdf
  • 下载附件到本地,解析PDF成PNG保存到本地
  • 插入记录到数据库,并保存最新下载的邮件,以便下次获取新邮件

三、代码实现

1、fetch_mail.py(获取邮件)
# -*- coding: utf-8 -*-

import email

from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr

import poplib
import util as u
import do_email_list as do_list
import datetime
import change_pdf_to_png as cptp


# Mailbox address, password and POP3 server host used to log in.
username = '[email protected]'
password = 'xxx'
pop3_server = 'pop3.mail.aliyun.com'


# Date stamp (YYYYMMDD) computed once, when this module is imported. It names
# the per-day output directory and is passed on to the PDF converter.
# NOTE(review): because it is evaluated only at import time, a process that
# keeps running past midnight continues to use the start-up date - confirm
# whether that is intended.
time = datetime.datetime.now().strftime("%Y%m%d")
# Every backslash is escaped explicitly. The previous literal relied on the
# unrecognised escapes "\e" and "\%", which Python only tolerates with a
# deprecation warning. The resulting string value is unchanged.
mkpath = "C:\\img\\email\\%s" % time
u.mkdir(mkpath)


def guess_charset(msg):
    """Return the character set declared for *msg*, or None if there is none.

    The charset attached to the message object is preferred. Otherwise the
    ``charset=`` parameter is extracted from the lower-cased Content-Type
    header. Only the parameter value itself is returned: any parameters that
    follow it (``; format=flowed``) and surrounding quotes are removed, so
    the result can be handed straight to ``bytes.decode``.
    """
    charset = msg.get_charset()
    if charset is None:
        content_type = msg.get('Content-Type', '').lower()
        pos = content_type.find('charset=')
        if pos >= 0:
            # Skip past "charset=" (8 characters), cut at the next ';' and
            # drop whitespace and optional quoting around the value.
            raw = content_type[pos + 8:].split(';', 1)[0]
            charset = raw.strip().strip('"\'')
    return charset


def decode_str(s):
    """Decode an RFC 2047 encoded header value into a plain string.

    ``decode_header`` may split a header into several (value, charset)
    chunks. All of them are decoded and concatenated; previously only the
    first chunk was used, which truncated headers made of more than one
    encoded word or mixing encoded and plain text.

    Undecodable bytes are replaced rather than raising, and an unknown
    charset name falls back to UTF-8.
    """
    pieces = []
    for value, charset in decode_header(s):
        if isinstance(value, bytes):
            try:
                value = value.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # The charset name is not known to Python's codec registry.
                value = value.decode('utf-8', errors='replace')
        pieces.append(value)
    return ''.join(pieces)


def print_info(msg, indent=0):
    """Recursively print the structure of an email message.

    At the top level (``indent == 0``) the From, To and Subject headers are
    printed first. Multipart messages are walked part by part with one more
    level of indentation; text parts are printed decoded, every other leaf
    part is reported by its content type.

    :param msg: the message (or sub-part) to print.
    :param indent: nesting depth; each level adds two spaces of padding.
    """
    pad = '  ' * indent
    if indent == 0:
        for header in ['From', 'To', 'Subject']:
            value = msg.get(header, '')
            if value:
                if header == 'Subject':
                    value = decode_str(value)
                else:
                    hdr, addr = parseaddr(value)
                    name = decode_str(hdr)
                    value = u'%s <%s>' % (name, addr)
            print('%s%s: %s' % (pad, header, value))
    if msg.is_multipart():
        for n, part in enumerate(msg.get_payload()):
            print('%spart %s' % (pad, n))
            print('%s--------------------' % pad)
            print_info(part, indent + 1)
        return

    content_type = msg.get_content_type()
    if content_type == 'text/plain' or content_type == 'text/html':
        content = msg.get_payload(decode=True) or b''
        charset = guess_charset(msg)
        # Always decode to str: concatenating the raw bytes with '...' raised
        # TypeError whenever no charset was declared.
        try:
            text = content.decode(charset or 'utf-8', errors='replace')
        except (LookupError, TypeError):
            # Unknown codec name, or a charset value that is not a string.
            text = content.decode('utf-8', errors='replace')
        print('%sText: %s' % (pad, text + '...'))
    else:
        print('%sAttachment: %s' % (pad, content_type))


def get_email_headers(msg):
    """Collect the From, To, Subject and Date headers of *msg*.

    Returns a dict that contains only the headers actually present, under
    the keys ``'from'``, ``'to'``, ``'subject'`` and ``'date'``. Addresses
    are rendered as ``'name <addr>'`` with the display name decoded, the
    subject is decoded, and the date is kept as the raw header text.
    """
    headers = {}
    for header in ['From', 'To', 'Subject', 'Date']:
        value = msg.get(header, '')
        if value:
            if header == 'Date':
                # Must not fall through to the address branch below:
                # previously the Date value was parsed as an address and
                # overwrote headers['to'].
                headers['date'] = value
            elif header == 'Subject':
                headers['subject'] = decode_str(value)
            else:
                # From / To: decode the display name, keep the address.
                hdr, addr = parseaddr(value)
                name = decode_str(hdr)
                value = u'%s <%s>' % (name, addr)
                if header == 'From':
                    headers['from'] = value
                else:
                    headers['to'] = value
        content_type = msg.get_content_type()
        print('head content_type: ', content_type)
    return headers


def get_email_content(message, base_save_path, id_card):
    """Save the first attachment of *message* and return its text body.

    Every non-multipart part is inspected:

    * The first part that has a filename is written to
      ``base_save_path + "QQ.pdf"`` (the original filename is not used),
      converted to PNG via ``cptp.run_convert`` and, if no record exists
      yet for *id_card* today, registered through
      ``do_list.insert_email_list``. Any failure in conversion or database
      access is reported and the attachment is skipped.
    * ``text/plain`` and ``text/html`` parts are decoded with their declared
      charset; the last such part wins.

    :param message: parsed email message.
    :param base_save_path: directory prefix, including the trailing
        separator, where the attachment is stored.
    :param id_card: ID number used to name the images and the DB record.
    :return: ``(content, attachment_files)`` - the body text (raw bytes if
        the part declares no charset, ``''`` if there is no text part) and
        the list of saved attachment names.
    """
    attachment_saved = False
    content = ''
    attachment_files = []
    for part in message.walk():
        if part.is_multipart():
            continue
        file_name = part.get_filename()
        content_type = part.get_content_type()
        if file_name and not attachment_saved:
            # Only the first attachment is processed.
            attachment_saved = True
            data = part.get_payload(decode=True)
            url = base_save_path + "QQ.pdf"
            # The context manager guarantees the handle is closed even if
            # the write fails.
            with open(url, 'wb') as att_file:
                att_file.write(data)
            attachment_files.append("QQ.pdf")
            try:
                img_info = cptp.run_convert(url, time, id_card)

                if img_info:
                    if do_list.get_now_exist_email(id_card)['count_sfzhm'] <= 0:
                        do_list.insert_email_list(id_card, img_info["pagenum"], "%s/%s" % (time, id_card))
            except Exception as exc:
                # Best effort: a broken attachment must not stop the run.
                # Unlike a bare ``except:`` this lets KeyboardInterrupt and
                # SystemExit through, and the cause is shown for diagnosis.
                print("这个有问题 跳过")
                print(repr(exc))
        elif content_type == 'text/plain' or content_type == 'text/html':
            # Message body.
            data = part.get_payload(decode=True)
            charset = guess_charset(part)
            if charset:
                # Keep only the charset token: drop trailing parameters and
                # optional quotes so it is a valid codec name.
                charset = charset.strip().split(';')[0].strip().strip('"\'')
                print('charset:', charset)
                data = data.decode(charset)
            content = data
    return content, attachment_files

def get_email():
    """Fetch new mails over POP3 and process their PDF attachments.

    Steps:

    1. Log in to the POP3 server and list all messages.
    2. Ask the database for the last recorded mail marker and locate it in
       the listing with ``u.get_list``.
    3. For every listing entry after that position: record it with
       ``do_list.insert_email_no``, download and parse the message, then
       validate its subject and hand it to ``get_email_content``.

    Messages are never deleted from the server. The connection is closed
    even when processing fails part-way.
    """
    # Connect to the POP3 server.
    server = poplib.POP3(pop3_server)
    try:
        # Protocol debug output.
        server.set_debuglevel(1)
        print(server.getwelcome().decode('utf-8'))
        # Authenticate.
        server.user(username)
        server.pass_(password)
        # stat() returns (message count, mailbox size).
        print('Messages: %s. Size: %s' % server.stat())
        # list() returns one entry per message, e.g. [b'1 82923', b'2 2184'].
        resp, mails, octets = server.list()
        print('------ resp ------')
        print(resp)
        print('------ mails ------')
        print(mails)
        print('------ octets ------')
        print(octets)

        last_num = do_list.get_last_email_list()
        if last_num == -1:
            print('-------------------- 未获取到最后数字 --------------------')
            return

        # Index, within the listing, of the last mail already recorded.
        last_email_no = u.get_list(mails, last_num)
        if len(mails) - 1 < last_email_no:
            return

        # new_mails[0] is the already-recorded entry; the new ones follow it.
        new_mails = mails[last_email_no:]
        base_save_path = '%s/' % mkpath
        for i in range(len(new_mails) - 1):
            no_info = str(new_mails[i + 1], encoding="utf-8").split(' ', 1)
            # Remember this entry so that the next run starts after it.
            do_list.insert_email_no(no_info[1], no_info[0])
            # POP3 message numbers are 1-based: listing index
            # last_email_no + i + 1 is message number last_email_no + i + 2.
            resp, lines, octets = server.retr(last_email_no + i + 2)
            # 'lines' holds the raw message line by line. Undecodable bytes
            # are replaced so that one non-UTF-8 mail cannot abort the run.
            msg_content = b'\n'.join(lines).decode('utf-8', errors='replace')
            msg = Parser().parsestr(msg_content)

            print('-------------------- 邮件信息开始 --------------------')
            _handle_fetched_message(msg, base_save_path)
    finally:
        # Always close the session, including on errors.
        server.quit()


def _handle_fetched_message(msg, base_save_path):
    """Validate one message's subject and process its content if accepted.

    The subject must be either ``<id>`` or ``<prefix>_<id>`` where ``<id>``
    passes ``u.check_id_card``; otherwise a diagnostic line is printed and
    the message is skipped.
    """
    msg_headers = get_email_headers(msg)
    # .get(): a message without a Subject header has no 'subject' key.
    subject = msg_headers.get('subject')
    if not subject:
        print('-------------------- 邮件没有标题 --------------------')
        return

    fields = subject.split("_")
    if len(fields) not in (1, 2):
        print('-------------------- 邮件标题不以_规格 --------------------')
        return

    # With one field the subject itself is the ID; with two, the second is.
    id_card = fields[-1]
    if not u.check_id_card(id_card):
        print('-------------------- 邮件标题不符合规则结束 --------------------')
        return

    content, attachment_files = get_email_content(msg, base_save_path, id_card)
    print('subject:', subject)
    print('from_address:', msg_headers.get('from', ''))
    print('to_address:', msg_headers.get('to', ''))
    print('date:', msg_headers.get('date', ''))
    print('content:', content)
    print('attachment_files: ', attachment_files)
    print('-------------------- 邮件信息结束 --------------------')

# get_email()


2、change_pdf_to_png.py(pdf转png)
# -*- coding: utf-8 -*-
import io, os
from wand.image import Image
from wand.color import Color
from PyPDF2 import PdfFileReader, PdfFileWriter
# Most recently created reader for each filename. This module only writes to
# it; get_pdf_reader never reads an entry back.
memo = {}


def get_pdf_reader(filename):
    """Open *filename* with PyPDF2 and return the new reader.

    A fresh ``PdfFileReader`` (created with ``strict=False``) is built on
    every call and stored in ``memo`` under the filename, replacing any
    earlier entry.
    """
    memo[filename] = PdfFileReader(filename, strict=False)
    return memo[filename]

# Convert a PDF into one PNG image per page.
def run_convert(filename, path, idcard, res=120):
    """Render every page of a PDF as a PNG file.

    Each page is written to ``C:/img/email/<path>/<idcard>_<n>.png`` with
    ``n`` starting at 1. Afterwards ``C:/img/email/<path>/QQ.pdf`` is
    deleted if it exists.

    :param filename: path of the PDF to read.
    :param path: sub-directory name below ``C:/img/email``.
    :param idcard: prefix used for the image file names.
    :param res: rendering resolution passed to Wand.
    :return: dict with ``'pagenum'`` (number of pages) and ``'img_path'``
        (path of the last image written).
    :raises ValueError: if the PDF has no pages. Previously this case
        failed with an UnboundLocalError when building the result.
    """
    pdfile = get_pdf_reader(filename)
    pagenum = pdfile.getNumPages()
    if pagenum == 0:
        raise ValueError('PDF has no pages: %s' % filename)

    img_path = None
    for i in range(pagenum):
        # Copy the single page into its own in-memory PDF so that Wand
        # renders exactly one page per image.
        dst_pdf = PdfFileWriter()
        dst_pdf.addPage(pdfile.getPage(i))
        pdf_bytes = io.BytesIO()
        dst_pdf.write(pdf_bytes)
        pdf_bytes.seek(0)

        img_path = 'C:/img/email/%s/%s_%d.png' % (path, idcard, i+1)
        # The context manager releases the image even if saving fails.
        with Image(file=pdf_bytes, resolution=res) as img:
            img.format = 'png'
            img.compression_quality = 90
            img.background_color = Color("white")
            img.save(filename=img_path)

    file_info = {
        'pagenum': pagenum,
        'img_path': img_path
    }
    # Remove the temporary PDF once all pages have been rendered.
    pdf_copy = 'C:/img/email/%s/QQ.pdf' % path
    if os.path.exists(pdf_copy):
        os.remove(pdf_copy)
    return file_info

# run_convert('C:/img/email/20180907/QQ.pdf', '20180907', '320483199311213811')
3、do_email_list.py(操作数据库)
import pymysql
import datetime


# Connection settings for the 'examdb' database. The functions in this module
# connect with config_main; this dict is not referenced by them.
# NOTE(review): credentials are hard-coded in source - consider loading them
# from external configuration.
config = dict(
    host='192.168.10.40',
    port=3306,
    user='root',
    password='thinkon133',
    db='examdb',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)
# Connection settings for the 'galaxyledappdb' database used by every
# function below. Rows come back as dicts because of DictCursor.
config_main = dict(
    host='192.168.10.89',
    port=3306,
    user='root',
    password='thinkon133',
    db='galaxyledappdb',
    charset='utf8mb4',
    cursorclass=pymysql.cursors.DictCursor,
)




def insert_email_list(id_card, pagenum,  img_path):
    """Insert one record into the ``upfile`` table.

    :param id_card: value for column ``kh_sfzhm``.
    :param pagenum: value for column ``filespage``.
    :param img_path: value for column ``fileurl``.

    Errors are reported on stdout and the transaction is rolled back;
    nothing is raised to the caller. The connection is always closed.
    """
    db = pymysql.connect(**config_main)
    try:
        cursor = db.cursor()
        # Placeholders let the driver quote the values; id_card originates
        # from an email subject and must not be spliced into the SQL text.
        sql = "insert into upfile(kh_sfzhm, filespage, fileurl) values (%s, %s, %s)"
        # pagenum is passed as text so it is sent as a quoted literal,
        # exactly as the previous hand-built statement did.
        cursor.execute(sql, (id_card, str(pagenum), img_path))
        db.commit()
    except Exception:
        print("Error: 插入unable to fetch data")
        db.rollback()
    finally:
        db.close()

def get_now_exist_email(id_card):
    """Count today's ``upfile`` records for *id_card*.

    :param id_card: value matched against column ``kh_sfzhm``.
    :return: the fetched row as a dict with key ``'count_sfzhm'``, or ``0``
        if the query fails (kept for backward compatibility).

    The connection is closed on every path; previously it stayed open when
    the query succeeded.
    """
    now = datetime.datetime.now()
    start_time = now.strftime("%Y-%m-%d 00:00:00")
    end_time = now.strftime("%Y-%m-%d 23:59:59")
    # Driver-side placeholders: the ID is compared as a properly quoted
    # string. Splicing it in unquoted produced invalid SQL for IDs that end
    # in 'X'.
    sql = ("select count(kh_sfzhm) as count_sfzhm from upfile "
           "where kh_sfzhm=%s and createtime >= %s and createtime <= %s")
    db = pymysql.connect(**config_main)
    try:
        cursor = db.cursor()
        cursor.execute(sql, (id_card, start_time, end_time))
        return cursor.fetchone()
    except Exception:
        print("Error: 查询重复unable to fetch data")
        return 0
    finally:
        db.close()


def insert_email_no(list_no, num):
    """Insert one ``(list_no, num)`` row into the ``email_list`` table.

    Errors are reported on stdout and the transaction is rolled back;
    nothing is raised to the caller. The connection is always closed.
    """
    db = pymysql.connect(**config_main)
    try:
        cursor = db.cursor()
        # Values are bound by the driver instead of being formatted into
        # the SQL text.
        sql = "insert email_list(list_no, num) values (%s, %s)"
        cursor.execute(sql, (list_no, num))
        db.commit()
    except Exception:
        print("Error: unable to fetch data")
        db.rollback()
    finally:
        db.close()


def get_last_email_list():
    """Return the ``num`` value of the newest ``email_list`` row.

    The newest row is the one with the highest ``id``.

    :return: the row's ``num`` value, or ``-1`` when the table is empty or
        the query fails.

    The connection is closed on every path; previously both return
    statements inside the ``try`` skipped ``db.close()``.
    """
    db = pymysql.connect(**config_main)
    try:
        cursor = db.cursor()
        sql = "select list_no, num from email_list order by id desc limit 1"
        cursor.execute(sql)
        results = cursor.fetchall()
        if results:
            return results[0]['num']
        return -1
    except Exception:
        print("Error: 获取最后一个IDunable to fetch data")
        return -1
    finally:
        db.close()


# Running this module directly performs a single lookup.
if __name__ == '__main__':
    get_last_email_list()

4、util.py(工具类)
import re


def get_list(mails, email_id):
    """Return the index of the first listing entry whose second field is *email_id*.

    Each entry of *mails* is a bytes value of the form ``b'<a> <b>'``; the
    text after the first space is compared with *email_id* using ``==``.

    Returns 0 when nothing matches, which callers cannot tell apart from a
    match at index 0.
    """
    matches = []
    for position, raw in enumerate(mails):
        fields = str(raw, encoding="utf-8").split(' ', 1)
        if fields[1] == email_id:
            matches.append(position)
    return matches[0] if matches else 0


# Create a directory (including missing parents).
def mkdir(path):
    """Create *path* if it does not exist yet.

    Leading and trailing whitespace and trailing backslashes are removed
    from *path* first.

    :return: True if the directory was created, False if it already existed.

    The status messages are now actually printed: the previous code had a
    bare ``print`` followed by an orphan string expression (a Python 2
    leftover), which produces no output under Python 3.
    """
    import os

    path = path.strip().rstrip("\\")

    if os.path.exists(path):
        print(path + ' 目录已存在')
        return False

    os.makedirs(path)
    print(path + ' 创建成功')
    return True


# Validate the format of an ID-card number.
def check_id_card(id_card):
    """Return True if *id_card* matches the 18-character ID pattern.

    The pattern requires: six digits not starting with 0, a four-digit year
    starting with 1 or 2, a month 01-12, a day 01-31, three digits, and a
    final digit or ``x``/``X``. Only the format is checked; no checksum is
    computed.
    """
    pattern = r'^([1-9]\d{5}[12]\d{3}(0[1-9]|1[012])(0[1-9]|[12][0-9]|3[01])\d{3}[0-9xX])$'
    return re.search(pattern, id_card) is not None

5、scheduler.py(定时任务)
import schedule
import time
import fetch_mail as fm
import datetime


def work():
    """Print a fixed placeholder message."""
    message = "嘿嘿"
    print(message)


# Register the mail fetch as a job that runs every 20 minutes.
fetch_job = schedule.every(20).minutes
fetch_job.do(fm.get_email)
while True:
    # Run whatever is due, then sleep for one minute before checking again.
    schedule.run_pending()
    time.sleep(60)
    now_text = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("现在的时间:%s" % now_text)

最后执行scheduler.py则可以每20分钟获取一次邮件。


如有问题可在下方留言

你可能感兴趣的:(Python)