用python收取邮件的一个实例

背景

有一个工作邮箱,会不断收到许多人投递的简历。由于邮件数量比较多,因此产生了一个需求:自动将邮件从邮件服务器取回到本地,并将邮件的基本信息存入本地的sqlite数据库;邮件的正文则以单独文件的形式存放在文件夹下。

实现

备注:在python2.7下测试运行正常,如果用python3,可能需要对代码稍做修改。

1.邮件配置参数文件
mail.conf

[mail163]
#此处应写上你实际的帐号与密码
user = xxxx@163.com
password = xxxxx
pop3_server = pop3.163.com

[sqlite]
dir = sqlite
fileName = mailLog.db

2.sqlite数据表的结构
用python收取邮件的一个实例_第1张图片
3.从邮件服务器收取邮件的python代码
mailManager.py

# -*- coding:utf-8 -*- 
# 读取邮件并解码存入日志数据库

import poplib
import email
import ConfigParser
import os, sys, string,time
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
from logHelper import LogHelper

# Return the decoded text body of an email message.
def getBody(msg,guessedCharset='gb2312'):
    """Collect the decoded text of all text/plain and text/html parts of *msg*.

    msg: an email.message.Message, possibly multipart.
    guessedCharset: charset to assume for a part that declares none.

    Returns the concatenated decoded text. Parts with any other content
    type (attachments, images, ...) contribute an empty string.
    """
    if msg.is_multipart():
        bodyText = ''
        for part in msg.get_payload():
            try:
                # Hand the guess down explicitly; otherwise nested parts
                # silently fall back to this function's default charset.
                bodyText += getBody(part, guessedCharset)
            except (UnicodeDecodeError, TypeError) as e:
                # A part that could not be decoded stays a byte string and
                # cannot be appended to text; skip it and keep the rest.
                print(e)
        return bodyText

    if msg.get_content_type() not in ('text/plain', 'text/html'):
        return ''

    content = msg.get_payload(decode=True)
    if content is None:
        return ''
    # Prefer the charset this part declares in its own Content-Type header;
    # the message-wide guess is only a fallback.
    charset = msg.get_content_charset() or guessedCharset
    return decodeString(content, charset, 'body', guessedCharset)

# Decode a string taken from an email header.
def decode_strInHeader(s,guessedCharset='gb2312'):
    """Decode a raw (possibly RFC 2047 encoded) header value to text.

    s: the raw header string.
    guessedCharset: charset to assume for fragments that declare none.

    Returns the decoded header. decode_header() yields one
    (value, charset) pair per fragment, so every fragment is decoded and
    the results are joined; taking only the first pair would drop the rest
    of a header such as 'Re: =?gb2312?B?...?='.
    """
    fragments = [
        decodeString(value, charset, 'header', guessedCharset)
        for value, charset in decode_header(s)
    ]
    if not fragments:
        return ''
    if len(fragments) == 1:
        return fragments[0]
    try:
        return ''.join(fragments)
    except (UnicodeDecodeError, TypeError):
        # Some fragment could not be decoded and is still a byte string, so
        # it cannot be joined with text. Fall back to the first fragment.
        return fragments[0]

# Decode a byte string to text.
def decodeString(s,charset,extra='header',guessedCharset='gb2312'):
    """Decode the byte string *s* using *charset*, with pragmatic fallbacks.

    s: the bytes to decode; anything that is not a byte string is returned
       unchanged.
    charset: declared charset name, or None to use *guessedCharset*.
    extra: free-form tag ('header', 'body', ...) printed in error messages.
    guessedCharset: charset used when *charset* is None.

    Returns the decoded text, or *s* unchanged when no charset is known or
    every decoding attempt fails (a diagnostic is printed in that case).
    """
    value = s
    if charset is None:
        charset = guessedCharset
    if not charset or not isinstance(value, bytes):
        return value

    # Accept non-string charset objects (e.g. email.charset.Charset).
    if not hasattr(charset, 'strip'):
        charset = str(charset)
    # Drop noise around the charset name (quotes are the common case) and
    # compare case-insensitively: headers may say "GB2312" or "UTF-8".
    charset = charset.strip().strip('"\'').strip().lower()

    try:
        return value.decode(charset)
    except (UnicodeDecodeError, LookupError):
        # LookupError: the charset name is unknown to Python.
        pass

    if charset == 'gb2312':
        # gbk is a superset of gb2312; mail labelled gb2312 often contains
        # characters that only gbk can represent.
        fallback, errors = 'gbk', 'strict'
    elif charset in ('utf8', 'utf-8'):
        # Keep whatever decodes and drop the invalid bytes.
        fallback, errors = 'utf-8', 'ignore'
    else:
        # In practice the mail seen so far is either gb2312 or utf-8.
        print("decode error in decodeString!",charset,extra)
        return value

    try:
        return value.decode(fallback, errors)
    except UnicodeDecodeError:
        print("decode error in decodeString!",fallback,extra)
        return value

# Guess the charset of a message.
def guess_charset(msg):
    """Return the charset name declared by *msg*, or None if none is found.

    The message's charset object is used when one is set; otherwise the
    value of the charset= parameter of the Content-Type header is returned,
    lower-cased, without surrounding quotes and without any parameters
    that follow it (e.g. '; format=flowed').
    """
    charset = msg.get_charset()
    if charset is not None:
        # Hand back a plain name so callers can treat it as a string.
        return str(charset)

    content_type = msg.get('Content-Type', '').lower()
    pos = content_type.find('charset=')
    if pos < 0:
        return None

    # Keep only this parameter's value: cut at the next ';' and strip
    # whitespace and quotes.
    name = content_type[pos + len('charset='):].split(';', 1)[0]
    name = name.strip().strip('"\'').strip()
    return name or None

# Today's date as a string.
def today():
    """Return the current local date formatted as YYYY-MM-DD."""
    dateFormat = "%Y-%m-%d"
    # With no time tuple given, strftime formats the current local time.
    return time.strftime(dateFormat)

# Make sure a directory exists.
def ensureDir(dir):
    """Create directory *dir*, including missing parents, if it is absent.

    Does nothing when the directory already exists. Raises OSError when
    the path cannot be created (for example, it exists but is a file).
    """
    if os.path.isdir(dir):
        return
    try:
        # makedirs also creates missing parent directories, which a plain
        # mkdir would fail on.
        os.makedirs(dir)
    except OSError:
        # Another process may have created it between the check and the
        # call; only re-raise when the directory really is not there.
        if not os.path.isdir(dir):
            raise

# Log one email: save its body to a file and record it in the log database.
def logOneMail(server,index,dir,logHelper,parseScope='new'):
    """Retrieve message number *index*, store its body and log its metadata.

    server: a POP3 connection object; its retr() method is used.
    index: POP3 message number to retrieve.
    dir: base directory; the body is written under <dir>/<YYYY-MM-DD>/ in a
         file named after the Message-ID with an '.html' suffix.
    logHelper: object providing msgExists(messageID) and
               append(messageID, fromAddr, subject, contentFile, dateStr).
    parseScope: when 'new', a Message-ID that is already logged marks the
                end of the scan.

    Returns 'scopeEnd' when parseScope is 'new' and the message is already
    logged, otherwise 'ok' after the message has been appended to the log.
    """
    print('log Mail:', index)
    resp, lines, octets = server.retr(index)

    # retr() returns the raw message as a list of lines; join them back
    # into the original text and parse it into a message object.
    msgRaw = b'\r\n'.join(lines)
    if isinstance(msgRaw, str):
        msg = email.message_from_string(msgRaw)
    else:
        # Python 3: the raw message is bytes, not str.
        msg = email.message_from_bytes(msgRaw)

    guessedCharset = guess_charset(msg)

    # msg.get() returns '' for a header that is absent.
    subject = decode_strInHeader(msg.get("subject", ''), guessedCharset)

    # parseaddr() returns (real name, address); the display name is what is
    # logged as the sender.
    # NOTE(review): this is empty when the From header carries only an
    # address - confirm whether the address should be logged in that case.
    senderName, senderAddr = parseaddr(msg.get("from", ''))
    fromAddr = decode_strInHeader(senderName, guessedCharset)

    messageID = decode_strInHeader(msg.get("Message-ID", ''), guessedCharset)
    print('mail message id:', messageID)

    dateStr = decode_strInHeader(msg.get("Date", ''), guessedCharset)

    # Build the file name from the Message-ID. The header is supplied by the
    # sender, so characters that are path separators or otherwise invalid
    # in file names are replaced rather than trusted.
    baseName = messageID.strip().replace('<', '').replace('>', '')
    baseName = ''.join(
        '_' if (ch in '/\\:*?"|' or ord(ch) < 32) else ch
        for ch in baseName
    )
    if not baseName:
        # No usable Message-ID: fall back to the POP3 message number.
        baseName = 'no-message-id-%s' % index

    # One folder per day holds the message bodies.
    curDir = dir+'/'+today()+'/'
    ensureDir(curDir)
    contentFile = curDir + '/'+baseName+'.html'
    if not os.path.exists(contentFile):
        # Build the body before opening the file so that a failure does not
        # leave an empty file behind that would never be rewritten.
        body = getBody(msg, guessedCharset)
        if not isinstance(body, bytes):
            # Encode explicitly instead of relying on the interpreter's
            # default encoding.
            body = body.encode('utf-8')
        with open(contentFile, 'wb') as outFile:
            outFile.write(body)

    # An already-logged message marks the end of the 'new' range.
    if parseScope == 'new' and logHelper.msgExists(messageID):
        return 'scopeEnd'

    # Record the message in the log database.
    logHelper.append(messageID,fromAddr,subject,contentFile,dateStr)
    return 'ok'


# Log the most recent emails from the mail server.
def logTheMails(progressKey,parseScope='new'):
    """Fetch recent messages, newest first, and log them.

    progressKey: identifier of a batch-progress counter; currently unused
                 and kept for future use.
    parseScope: 'all' to go through every message, 'new' to stop at the
                first message that is already logged, or a number (as a
                string) to process at most that many of the newest messages.

    Connection settings are read from the [mail163] section of mail.conf
    and the database location from its [sqlite] section.

    Returns the number of messages logged. Raises ValueError when
    parseScope is neither 'all', 'new' nor a number.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")

    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    logDir = cf.get("sqlite", "dir")
    dbFileFullName = logDir+'/'+cf.get("sqlite", "fileName")

    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)

        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per message in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)

        if parseScope in ('all', 'new'):
            logCount = total
        else:
            # Never ask for more messages than exist; otherwise the index
            # below would reach 0 or go negative.
            logCount = max(0, min(int(parseScope), total))

        # Message numbers start at 1 and the newest message is number
        # `total`, so walk downwards from there.
        for index in range(total, total - logCount, -1):
            flag = logOneMail(server,index,logDir,logHelper,parseScope)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Close the connection even when fetching or logging fails.
        server.quit()

    return receivedCount

# Log the emails whose message numbers lie between two indexes.
def logMailsByIndex(beginIndex,endIndex):
    """Fetch and log messages numbered beginIndex..endIndex (inclusive).

    beginIndex, endIndex: POP3 message numbers. Both are clamped to the
    number of messages in the mailbox, and beginIndex is raised to 1 when
    smaller, because message numbers start at 1.

    Connection settings are read from the [mail163] section of mail.conf
    and the database location from its [sqlite] section.

    Returns the number of messages logged.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")

    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    logDir = cf.get("sqlite", "dir")
    dbFileFullName = logDir+'/'+cf.get("sqlite", "fileName")

    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)

        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per message in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)

        # Keep the range inside 1..total; with an empty mailbox the range
        # below becomes empty instead of requesting message 0.
        beginIndex = max(1, min(beginIndex, total))
        endIndex = min(endIndex, total)

        for index in range(beginIndex, endIndex + 1):
            # NOTE(review): logOneMail runs with its default scope 'new',
            # so the loop stops at the first message that is already
            # logged - confirm this is intended for index ranges.
            flag = logOneMail(server,index,logDir,logHelper)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Close the connection even when fetching or logging fails.
        server.quit()

    return receivedCount

4.根据命令行参数,读取指定范围(全部邮件、新收到的邮件或最近收到的若干封邮件)的邮件的代码
fetchMails.py

# -*- coding:utf-8 -*- 
#读取邮件
import os, sys, string
import time
import getopt
import mailManager
# Python 2 only: make implicit str/unicode conversions use UTF-8 instead of
# ASCII. Neither reload() nor sys.setdefaultencoding() exists on Python 3.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding( "utf-8" )

# Parse the command line:
#   -p/--progKey     key of the progress counter (reserved for future use)
#   -m/--mailBoxIdx  mailbox index (reserved for future use)
#   -s/--scope       'all' = every mail, 'new' = only newly received mail,
#                    a number = that many of the most recent mails
try:
    opts, args = getopt.getopt(sys.argv[1:],'p:m:s:',['progKey=','mailBoxIdx=','scope='])
except getopt.GetoptError:
    print('error:','options invalid')
    # Exit with a non-zero status so callers can detect the failure.
    sys.exit(2)

progressKey = ''
parseScope = 'new'
for k, v in opts:
    if k in ("-p", "--progKey"):
        progressKey = v
    elif k in ("-m", "--mailBoxIdx"):
        try:
            mailBoxIndex = int(v)
        except ValueError:
            print('error:','mailBoxIdx must be an integer')
            sys.exit(2)
    elif k in ("-s", "--scope"):
        parseScope = v

# Reject an unusable scope before any connection is opened.
if parseScope not in ('all', 'new') and not parseScope.isdigit():
    print('error:','scope must be all, new or a number')
    sys.exit(2)

print('oldCwd:',os.getcwd())
# Switch to the directory of this file so that relative paths such as
# mail.conf are resolved next to the script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print('newCwd:',os.getcwd())

print('')
print('fetch mails : begin...')
print('')

startTime = time.time()

if progressKey == '':
    progressKey = 'tempKey1'

# Fetch the mails and log them into the sqlite database.
receivedCount = mailManager.logTheMails(progressKey,parseScope)

print('')
print ('receivedCount:',receivedCount)
print('')

endTime = time.time()
print('used time/minutes: ',(endTime-startTime)/60)

你可能感兴趣的:(python,sqlite,编程语言)