# coding:utf-8
from login163 import *
from xml.parsers import expat
import MySQLdb
class mail163(Login163):
    '''
    Fetch and clean up the unread-mail listing of a 163 webmail session.

    The methods read self.sid and self.headers, which this class never
    assigns (presumably set by Login163 during login -- confirm).
    '''

    def get_unread_mail(self,start,limit):
        '''
        get 'limit' unread mails at once, beginning at offset 'start';
        the data format is xml

        Returns the raw response body if it contains the text 'subject',
        else returns None.
        '''
        # NOTE(review): this payload looks like an XML request body whose
        # tags were lost somewhere; it is kept exactly as found -- confirm
        # against what the service really expects.
        postdata = {
            'var':'1falsedatetrue'+str(start)+''+str(limit)+'falsefalsetruetrue'
        }
        postdata = urllib.urlencode(postdata)
        url = 'http://twebmail.mail.163.com/js5/s?sid='+self.sid+'&func=mbox:listMessages&deftabclick=t2&deftabclick=undefined&from=toolbar&type=unread&mboxentry=1'
        req = urllib2.Request(url=url,data=postdata,headers=self.headers)
        response = urllib2.urlopen(req)
        try:
            res = response.read()
        finally:
            # release the HTTP connection even if reading fails
            response.close()
        if 'subject' in res:
            return res
        return None

    def format(self,xml_data):
        '''
        xml data format, then return the data

        Applies the regex clean-up passes below to xml_data and returns
        the resulting string.
        '''
        # This pattern can only match the empty string, so the substitution
        # leaves the text unchanged.
        # NOTE(review): the alternatives were probably markup that got
        # stripped from the source -- confirm the intended pattern.
        pattern = re.compile(r'.*?||.*?',re.S)
        xml_data = pattern.sub('',xml_data)
        # Keep only the captured '...@...' part of every ';...@...&' span.
        # The pass runs twice on purpose: the first substitution can expose
        # a new match, so a single pass is not equivalent.
        pattern = re.compile(r'.*?;(.*?@.*?)&.*?')
        xml_data = pattern.sub(r'\1',xml_data)
        xml_data = pattern.sub(r'\1',xml_data)
        return xml_data
#db connect
class Db_Connect(object):
    '''
    Thin wrapper around a single MySQLdb connection.
    '''

    def __init__(self, db_host, user, pwd, db_name, charset="utf8",  use_unicode = True):
        '''
        Open the connection to db_name on db_host.

        On MySQLdb.OperationalError a diagnostic is printed and the
        process exits with status 1.
        '''
        try:
            self.conn = MySQLdb.Connection(db_host, user, pwd, db_name, charset=charset , use_unicode=use_unicode)
        except MySQLdb.OperationalError as e:
            print('Connect %s Failed' % db_host)
            print(e.args)
            sys.exit(1)

    def insert(self, sql, params=None):
        '''
        Execute one statement and commit it.

        sql    -- the statement; with params it uses MySQLdb's '%s'
                  placeholders, without params it is sent unchanged.
        params -- optional sequence of values bound by the driver, which
                  quotes and escapes them.

        Returns whatever cursor.execute() returns, or None when a
        MySQLdb.Warning / MySQLdb.IntegrityError was caught and printed.
        '''
        cursor = self.conn.cursor()
        try:
            n = cursor.execute(sql, params)
            # MySQLdb connections start with autocommit off, so without
            # an explicit commit the row is discarded on close().
            self.conn.commit()
            return n
        except (MySQLdb.Warning, MySQLdb.IntegrityError) as e:
            print(e.args)
        finally:
            cursor.close()

    def close(self):
        '''Close the underlying connection.'''
        self.conn.close()


class Mail_Handler(object):
    '''
    expat callbacks that collect the fields of each <object> element and
    insert them into the mails table through db_conn.
    '''

    # keys every mail record must contain, in column order
    FIELDS = ('id','from','to','subject','size')
    # placeholders are filled in by the driver, never by string formatting
    INSERT_SQL = "insert into mails(id, from_mail, to_mail, subject, size) values(%s, %s, %s, %s, %s)"

    def __init__(self,data,db_conn):
        self.flag = False       # True while text should be recorded
        self.mail = {}          # a mail info
        self.curr_attrib = ''   # key the current text is stored under
        self.data = data        # xml data
        self.db_conn = db_conn  # object providing insert(sql, params)
        self._chunks = []       # text pieces of the current element

    def start(self,name,attributes):
        '''
        Start-tag callback: a new 'object' begins a fresh mail record; an
        element that has attributes selects the key for the text that
        follows (the value of its first attribute).
        '''
        if name == 'object':
            self.mail = {}
        self._chunks = []
        if attributes:
            # first attribute value, without indexing a dict view
            self.curr_attrib = next(iter(attributes.values()))
            self.flag = True

    def end(self,name):
        '''
        End-tag callback: when an 'object' closes, store the record.

        Raises KeyError if one of FIELDS was not seen and ValueError if
        the size is not an integer.
        '''
        if name == 'object':
            values = [self.mail[field] for field in self.FIELDS]
            values[-1] = int(values[-1]) # the size type is int
            self.db_conn.insert(self.INSERT_SQL, tuple(values))
        self.flag = False

    def character(self,data):
        '''
        Text callback: expat may deliver one text node in several pieces
        (for example around '&amp;'), so the pieces are joined instead of
        letting the last one overwrite the others.
        '''
        if self.flag:
            self._chunks.append(data)
            self.mail[self.curr_attrib] = ''.join(self._chunks)

    def parser(self):
        '''Parse self.data, dispatching to the callbacks above.'''
        p = expat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.character
        # isfinal=True marks the end of input so a truncated document is
        # reported as an error instead of being silently accepted.
        p.Parse(self.data, True)
def main():
    '''
    Log in to 163 mail, page through the unread-mail listing and store
    every listed mail in the database.

    The listing is requested 5 entries at a time until
    get_unread_mail() returns None.
    '''
    # NOTE(review): the connection settings are hard-coded here.
    db_conn = Db_Connect('192.168.110.142','admin','admin','test')
    try:
        username = raw_input('Enter you email:')
        password = getpass.getpass('Enter you password:')
        login = mail163(username,password)
        sid = login.login() # login the 163 mail for getting sid
        # login success
        if sid:
            start = 0   # offset of the first mail to request
            limit = 5   # read 5 unread mails at once
            while True:
                res = login.get_unread_mail(start,limit)
                if res is None:
                    break
                res = login.format(res)  # use re module format data
                Mail_Handler(res,db_conn).parser()  # use expat parse xml
                start += limit
    finally:
        # close the connection even when login, download or parsing fails
        db_conn.close()


if __name__ == '__main__':
    main()
  




这段代码读取163邮箱中未读邮件的信息(id、发件人、收件人、标题、大小),并将其插入 MySQL 数据库,其中使用 expat 解析 XML 数据。Login163 类定义在"爬虫"一节中。

你可能感兴趣的:(xml,python,expat,python)