有一个工作邮箱,会接收许多人不断地投递的简历。由于邮件数量比较多,因此产生了一个需求。要求自动将邮件从邮件服务器取回到本地,并将邮件的基本信息存入本地的sqlite数据库。邮件的正文以单独文件的形式存放在文件夹下。
备注:在python2.7下测试运行正常,如果用python3,可能需要对代码稍做修改。
1.邮件配置参数文件
mail.conf
[mail163]
#此处应填写你实际的帐号与密码
user = xxxx@163.com
password = xxxxx
pop3_server = pop3.163.com
[sqlite]
dir = sqlite
fileName = mailLog.db
2.sqlite数据表的结构
3.从邮件服务器收取邮件的python代码
mailManager.py
# -*- coding:utf-8 -*-
# 读取邮件并解码存入日志数据库
import poplib
import email
import ConfigParser
import os, sys, string,time
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
from logHelper import LogHelper
#获取解码后的邮件体
def getBody(msg, guessedCharset='gb2312'):
    """Return the decoded text of a message, concatenating all text parts.

    Multipart messages are walked recursively. ``text/plain`` and
    ``text/html`` leaves are decoded through ``decodeString``; every other
    content type contributes an empty string.

    Args:
        msg: An ``email.message.Message`` or one of its sub-parts.
        guessedCharset: Charset to fall back on when a part does not declare
            a charset in its own Content-Type header.

    Returns:
        The concatenated body text. For a single text part this is whatever
        ``decodeString`` returns for its payload.
    """
    if msg.is_multipart():
        bodyText = ''
        for part in msg.get_payload():
            try:
                # Forward the fallback charset so nested parts use the
                # caller's guess rather than the hard-coded default.
                bodyText += getBody(part, guessedCharset)
            except UnicodeDecodeError as e:
                # Best effort: skip a part that cannot be combined with the
                # text collected so far, and report the actual error.
                print(e)
        return bodyText

    if msg.get_content_type() not in ('text/plain', 'text/html'):
        return ''

    content = msg.get_payload(decode=True)
    # The charset declared on the part itself takes precedence over the
    # message-level guess.
    charset = msg.get_content_charset() or guessedCharset
    return decodeString(content, charset, 'body', guessedCharset)
#解码邮件头中包含的字符串
def decode_strInHeader(s, guessedCharset='gb2312'):
    """Decode a raw (possibly RFC 2047 encoded) header value to text.

    ``decode_header`` may split a header into several ``(value, charset)``
    fragments. Each fragment is decoded with ``decodeString`` and the results
    are concatenated in order, so multi-fragment headers are not truncated
    to their first fragment.

    Args:
        s: Raw header value.
        guessedCharset: Charset used for fragments that declare none.

    Returns:
        The decoded header. A single-fragment header returns exactly what
        ``decodeString`` produced for it.
    """
    pieces = [
        decodeString(value, charset, 'header', guessedCharset)
        for value, charset in decode_header(s)
    ]
    if not pieces:
        return s
    if len(pieces) == 1:
        return pieces[0]
    try:
        return ''.join(pieces)
    except (TypeError, UnicodeDecodeError):
        # A fragment stayed undecoded and cannot be mixed with decoded text;
        # fall back to the leading fragment instead of failing.
        return pieces[0]
#解码字符串
def decodeString(s, charset, extra='header', guessedCharset='gb2312'):
    """Decode a byte string using ``charset``, with targeted fallbacks.

    Args:
        s: Value to decode. Values that are not byte strings are returned
            unchanged.
        charset: Declared charset; ``None`` means "use ``guessedCharset``".
            Surrounding quotes, whitespace and trailing ``; param=...``
            content are ignored, and codec aliases (``utf8``, ``UTF-8``)
            are treated alike.
        extra: Label included in diagnostic output ('header' or 'body').
        guessedCharset: Charset used when ``charset`` is ``None``.

    Returns:
        The decoded text, or ``s`` unchanged when no charset is available or
        decoding fails (a diagnostic is printed in the failure case).
    """
    import codecs

    if charset is None:
        charset = guessedCharset
    if not charset or not isinstance(s, bytes):
        return s

    # Remove noise around the charset name: quotes are the common case, and
    # values sliced out of a Content-Type header may carry extra parameters.
    charset = str(charset).split(';')[0].strip(' \t\r\n"\'')
    try:
        # Canonical codec name, so alias spellings compare equal below.
        codecName = codecs.lookup(charset).name
    except LookupError:
        print("decode error in decodeString!", charset, extra)
        return s

    try:
        return s.decode(codecName)
    except Exception:
        # The codec name comes from untrusted mail, and non-text codecs
        # raise assorted error types; any failure moves on to the fallbacks.
        pass

    if codecName == 'gb2312':
        # GBK is a superset of GB2312 and is often what senders really used.
        try:
            return s.decode('gbk')
        except UnicodeDecodeError:
            print("decode error in decodeString!", 'gbk', extra)
    elif codecName == 'utf-8':
        # Keep whatever is decodable and drop the invalid bytes.
        return s.decode('utf-8', 'ignore')
    else:
        print("decode error in decodeString!", charset, extra)
    return s
#获得msg的编码,猜测编码格式
def guess_charset(msg):
    """Return the charset associated with a message, or ``None``.

    An explicitly attached charset (``msg.get_charset()``) is returned as-is.
    Otherwise the ``charset`` parameter of the Content-Type header is used,
    parsed by the email package so that quotes and trailing parameters such
    as ``; format=flowed`` are not included in the result.

    Args:
        msg: An ``email.message.Message``.

    Returns:
        The charset, or ``None`` when the message declares none.
    """
    charset = msg.get_charset()
    if charset is None:
        charset = msg.get_content_charset()
    return charset
#当前日期字符串
def today():
    """Return the current local date formatted as ``YYYY-MM-DD``."""
    dateFormat = "%Y-%m-%d"
    # With no time tuple supplied, strftime formats the current local time.
    return time.strftime(dateFormat)
#确保文件夹存在
def ensureDir(dir):
    """Make sure the directory ``dir`` exists, creating parents as needed.

    Args:
        dir: Path of the directory to create. Nothing is done when the path
            already exists.

    Raises:
        OSError: If the directory cannot be created.
    """
    if os.path.exists(dir):
        return
    try:
        # makedirs also creates missing parent directories, which a single
        # mkdir cannot do.
        os.makedirs(dir)
    except OSError:
        # Another process may have created it between the check and the
        # call; only a real failure is re-raised.
        if not os.path.isdir(dir):
            raise
#登记一封邮件
def logOneMail(server, index, dir, logHelper, parseScope='new'):
    """Fetch one message, store its body on disk and record it in the log DB.

    Args:
        server: Authenticated ``poplib.POP3`` connection.
        index: 1-based number of the message on the server.
        dir: Base directory; bodies are written to ``<dir>/<YYYY-MM-DD>/``
            using today's date.
        logHelper: Object providing ``msgExists(messageID)`` and
            ``append(messageID, fromAddr, subject, contentFile, dateStr)``.
        parseScope: When ``'new'``, a Message-ID that is already logged ends
            processing for this message.

    Returns:
        ``'scopeEnd'`` when ``parseScope`` is ``'new'`` and the message is
        already logged (nothing is written in that case); ``'ok'`` otherwise.
    """
    print('log Mail:', index)
    resp, lines, octets = server.retr(index)
    # retr() yields the raw message line by line; rejoin into one blob.
    msgRaw = b'\r\n'.join(lines)
    # On Python 3 the lines are bytes and need message_from_bytes;
    # Python 2 only offers message_from_string.
    parseMessage = getattr(email, 'message_from_bytes',
                           email.message_from_string)
    msg = parseMessage(msgRaw)

    guessedCharset = guess_charset(msg)
    subject = decode_strInHeader(msg.get("subject", ''), guessedCharset)
    # NOTE(review): the display name, not the address, is stored as
    # fromAddr -- confirm this is intended.
    senderName, _senderAddr = parseaddr(msg.get("from", ''))
    fromAddr = decode_strInHeader(senderName, guessedCharset)
    messageID = decode_strInHeader(msg.get("Message-ID", ''), guessedCharset)
    print('mail message id:', messageID)
    dateStr = decode_strInHeader(msg.get("Date", ''), guessedCharset)

    # Check for the end of the scope before touching the disk, so an
    # already-logged mail is not written again into today's folder.
    if parseScope == 'new' and logHelper.msgExists(messageID):
        return 'scopeEnd'

    # Build the file name from the Message-ID. The value comes from the mail
    # itself, so path separators and other unsafe characters are neutralised.
    baseName = messageID.strip().replace('<', '').replace('>', '')
    baseName = ''.join(
        '_' if (ch in '/\\:*?"|' or ord(ch) < 32) else ch
        for ch in baseName
    )
    if not baseName.strip('.'):
        # Empty, '.' or '..' would not name a usable file.
        baseName = 'mail_%s' % index

    # Bodies are grouped in one folder per day.
    curDir = dir + '/' + today()
    ensureDir(curDir)
    contentFile = curDir + '/' + baseName + '.html'
    if not os.path.exists(contentFile):
        body = getBody(msg, guessedCharset)
        # Encode explicitly so the write does not depend on the
        # interpreter's default encoding.
        if not isinstance(body, bytes):
            body = body.encode('utf-8')
        with open(contentFile, 'wb') as outFile:
            outFile.write(body)

    logHelper.append(messageID, fromAddr, subject, contentFile, dateStr)
    return 'ok'
#登记邮件,从邮件服务器中取出最近的一些邮件,
#parseScope='all',则取出所有的邮件;'new',取出新收到的邮件;也可以给出一个数字,取出指定数量的最新邮件
#progressKey是批操作计数器的标识值,目前保留备用
def logTheMails(progressKey, parseScope='new'):
    """Log the most recent mails from the configured POP3 mailbox.

    Connection settings and the storage directory are read from
    ``mail.conf`` in the current working directory. Mails are processed from
    the newest (highest index) downwards.

    Args:
        progressKey: Identifier of a batch progress counter; currently
            unused and kept for future use.
        parseScope: ``'all'`` or ``'new'`` to walk the whole mailbox (with
            ``'new'``, ``logOneMail`` stops at the first already-logged
            mail), or a number (as ``int`` or numeric string) limiting how
            many of the newest mails are processed.

    Returns:
        The number of mails logged in this run.

    Raises:
        ValueError: If ``parseScope`` is neither a keyword nor a number.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    storeDir = cf.get("sqlite", "dir")
    dbFileFullName = storeDir + '/' + cf.get("sqlite", "fileName")

    # NOTE(review): poplib.POP3 is an unencrypted connection; consider
    # poplib.POP3_SSL if the server supports it.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)
        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per message in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)
        if parseScope in ('all', 'new'):
            logCount = total
        else:
            # Never ask for more mails than exist; indexes below 1 are
            # invalid on the server.
            logCount = max(0, min(int(parseScope), total))

        # Message numbers start at 1 and the newest mail has number `total`.
        for index in range(total, total - logCount, -1):
            flag = logOneMail(server, index, storeDir, logHelper, parseScope)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Always release the server connection, even when a step fails.
        server.quit()
    return receivedCount
#登记邮件,取出起始索引号与结束索引号之间的一些邮件
def logMailsByIndex(beginIndex, endIndex):
    """Log the mails whose server indexes lie in ``[beginIndex, endIndex]``.

    Connection settings and the storage directory are read from
    ``mail.conf`` in the current working directory. Mails are processed in
    ascending index order. ``logOneMail`` is called with its default scope
    (``'new'``), so processing stops at the first mail that is already
    logged.

    Args:
        beginIndex: First message number to process (1-based); clamped to
            the valid range of the mailbox.
        endIndex: Last message number to process (inclusive); clamped to the
            number of mails on the server.

    Returns:
        The number of mails logged in this run.
    """
    cf = ConfigParser.ConfigParser()
    cf.read("mail.conf")
    user = cf.get("mail163", "user")
    password = cf.get("mail163", "password")
    pop3_server = cf.get("mail163", "pop3_server")
    storeDir = cf.get("sqlite", "dir")
    dbFileFullName = storeDir + '/' + cf.get("sqlite", "fileName")

    # NOTE(review): poplib.POP3 is an unencrypted connection; consider
    # poplib.POP3_SSL if the server supports it.
    server = poplib.POP3(pop3_server)
    receivedCount = 0
    try:
        server.user(user)
        server.pass_(password)
        logHelper = LogHelper(dbFileFullName)

        # list() returns one entry per message in the mailbox.
        resp, mails, octets = server.list()
        total = len(mails)
        # Message numbers start at 1; keep both bounds inside the mailbox so
        # an empty mailbox or an out-of-range bound requests nothing invalid.
        beginIndex = max(1, min(beginIndex, total))
        endIndex = min(endIndex, total)

        for index in range(beginIndex, endIndex + 1):
            flag = logOneMail(server, index, storeDir, logHelper)
            if flag == 'scopeEnd':
                break
            receivedCount += 1
    finally:
        # Always release the server connection, even when a step fails.
        server.quit()
    return receivedCount
4.根据命令行参数,读取指定时间范围内的邮件的代码
fetchMails.py
# -*- coding:utf-8 -*-
#读取邮件
import os, sys, string
import time
import getopt
import mailManager
# Python 2 only: make implicit str/unicode conversions use UTF-8. Python 3
# has no reload()/setdefaultencoding(), so the step is skipped there.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding( "utf-8" )

# Parse the command line:
#   -p/--progKey     key of the progress counter (reserved for future use)
#   -m/--mailBoxIdx  mailbox number (reserved for future use)
#   -s/--scope       'all' to parse every mail, 'new' for newly received
#                    mails only, or a number N for the N most recent mails
try:
    opts, args = getopt.getopt(sys.argv[1:], 'p:m:s:',
                               ['progKey=', 'mailBoxIdx=', 'scope='])
except getopt.GetoptError:
    print('error:','options invalid')
    # Exit with a non-zero status so callers can detect the usage error.
    sys.exit(2)

progressKey = ''
parseScope = 'new'
for k, v in opts:
    if k in ("-p", "--progKey"):
        progressKey = v
    elif k in ("-m", "--mailBoxIdx"):
        mailBoxIndex = int(v)
    elif k in ("-s", "--scope"):
        parseScope = v

print('oldCwd:',os.getcwd())
# Switch to this script's directory so that relative paths such as
# "mail.conf" resolve next to the script.
os.chdir(os.path.dirname(os.path.abspath(__file__)))
print('newCwd:',os.getcwd())

# print('') emits the same blank line as a bare Python 2 `print` statement
# and is also valid on Python 3.
print('')
print('fetch mails : begin...')
print('')

startTime = time.time()
if progressKey == '':
    progressKey = 'tempKey1'

# Fetch the mails and record them in the sqlite database.
receivedCount = mailManager.logTheMails(progressKey,parseScope)

print('')
print ('receivedCount:',receivedCount)
print('')

endTime = time.time()
print('used time/minutes: ',(endTime-startTime)/60)