xml数据处理--expat模块使用

# coding:utf-8
from login163 import *
from xml.parsers import expat
import MySQLdb
class mail163(Login163):
    '''
    163 webmail helper: lists unread mails and tidies the XML reply.

    The methods read self.sid and self.headers, which are expected to be
    provided by Login163 (defined outside this file -- confirm there).
    '''
    def get_unread_mail(self,start,limit):
        '''
        Fetch one batch of unread mails as XML.

        start: value sent in the request's "start" field
        limit: value sent in the request's "limit" field
        Returns the raw response body when it contains 'subject',
        otherwise None.
        '''
        # Adjacent literals are joined at compile time, so this is the same
        # single-line request document as before, just readable.
        var = (
            '<?xml version="1.0"?><object>'
            '<int name="fid">1</int>'
            '<boolean name="skipLockedFolders">false</boolean>'
            '<string name="order">date</string>'
            '<boolean name="desc">true</boolean>'
            '<int name="start">' + str(start) + '</int>'
            '<int name="limit">' + str(limit) + '</int>'
            '<boolean name="topFirst">false</boolean>'
            '<object name="filterFlags"><boolean name="read">false</boolean></object>'
            '<boolean name="returnTotal">true</boolean>'
            '<boolean name="returnTag">true</boolean>'
            '</object>'
        )
        postdata = urllib.urlencode({'var': var})
        url = 'http://twebmail.mail.163.com/js5/s?sid='+self.sid+'&func=mbox:listMessages&deftabclick=t2&deftabclick=undefined&from=toolbar&type=unread&mboxentry=1'
        req = urllib2.Request(url=url,data=postdata,headers=self.headers)
        response = urllib2.urlopen(req)
        try:
            res = response.read()
        finally:
            # release the socket even if read() fails; it was never closed before
            response.close()
        if 'subject' in res:
            return res
        return None

    def format(self,xml_data):
        '''
        Simplify the XML before it is handed to the parser.

        Removes every "ctrls" and "flags" object, then rewrites the "from"
        and "to" strings so that only the text captured between a ';' and
        the following '&' remains (presumably the address inside an escaped
        &lt;...&gt; pair -- confirm against a real response).
        Returns the rewritten XML text.
        '''
        # nested objects would reset the mail being collected by the parser
        nested = re.compile(r'<object name="ctrls">.*?</object>|<object name="flags" />|<object name="flags">.*?</object>',re.S)
        xml_data = nested.sub('',xml_data)
        for field in ('from','to'):
            address = re.compile(r'<string name="%s">.*?;(.*?@.*?)&.*?</string>' % field)
            xml_data = address.sub(r'<string name="%s">\1</string>' % field,xml_data)
        return xml_data
#db connect
class Db_Connect(object):
    '''
    Small wrapper around one MySQLdb connection.
    '''
    def __init__(self, db_host, user, pwd, db_name, charset="utf8", use_unicode=True):
        '''
        Open the connection.

        If MySQLdb raises OperationalError the error is printed and the
        process exits with status 1.
        '''
        try:
            self.conn = MySQLdb.Connection(db_host, user, pwd, db_name, charset=charset, use_unicode=use_unicode)
        except MySQLdb.OperationalError as e:
            print('Connect %s Failed' % db_host)
            print(e.args)
            sys.exit(1)

    def insert(self, sql, params=None):
        '''
        Execute one statement and commit it.

        sql:    statement text; use %s placeholders when 'params' is given
        params: optional sequence of values bound by the driver, which
                quotes and escapes them; leave out to run 'sql' verbatim
        Returns what cursor.execute() returns, or None when a
        MySQLdb.Warning / MySQLdb.IntegrityError was caught and printed.
        '''
        cursor = self.conn.cursor()
        try:
            n = cursor.execute(sql, params)
            # MySQLdb leaves autocommit off, so without an explicit commit
            # rows written to transactional tables are dropped on close()
            self.conn.commit()
            return n
        except MySQLdb.Warning as e:
            print(e.args)
        except MySQLdb.IntegrityError as e:
            print(e.args)
        finally:
            cursor.close()

    def close(self):
        '''Close the underlying connection.'''
        self.conn.close()


class Mail_Handler(object):
    '''
    expat callbacks that turn every <object> element of the XML into one
    row of the 'mails' table.
    '''
    # keys of the collected mail, in the column order of _INSERT_SQL
    _FIELDS = ('id','from','to','subject','size')
    # every value is bound by the driver instead of being pasted into the text
    _INSERT_SQL = "insert into mails(id, from_mail, to_mail, subject, size) values(%s, %s, %s, %s, %s)"

    def __init__(self,data,db_conn):
        '''
        data:    XML text to parse
        db_conn: object whose insert(sql, params) stores one row
        '''
        self.flag = False       # True while text should be stored under curr_attrib
        self.mail = {}          # fields collected for the current mail
        self.curr_attrib = ''   # key for the next piece of text
        self.data = data        # xml data
        self.db_conn = db_conn

    def start(self,name,attributes):
        '''
        Start-tag callback: begin a new mail on <object> and remember the
        key for the element's text.

        For <string name="id">...</string> the key is "id".
        '''
        if name == 'object':
            self.mail = {}
        if attributes:
            if 'name' in attributes:
                self.curr_attrib = attributes['name']
            else:
                # no "name" attribute: fall back to the first value, as before
                self.curr_attrib = list(attributes.values())[0]
            self.flag = True

    def end(self,name):
        '''
        End-tag callback: when an <object> closes, insert the collected mail.

        Raises KeyError if one of the fields was not collected and
        ValueError if the size is not an integer.
        '''
        if name == 'object':
            values = [self.mail[key] for key in self._FIELDS]
            values[-1] = int(values[-1]) # the size type is int
            self.db_conn.insert(self._INSERT_SQL, tuple(values))
        self.flag = False

    def character(self,data):
        '''Text callback: store the text under the current key.'''
        if self.flag:
            self.mail[self.curr_attrib] = data

    def parser(self):
        '''Parse self.data, inserting one row per <object> element.'''
        p = expat.ParserCreate()
        # expat may deliver one text node in several pieces (e.g. around
        # '&amp;'); buffering hands it over in one call, so character()
        # no longer keeps only the last piece
        p.buffer_text = True
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.character
        # True marks the input as complete so truncated XML is reported
        p.Parse(self.data, True)
def main():
    '''
    Log in to 163 mail and store every unread mail in the database.

    Connects to MySQL, asks for the mail account and password, then
    fetches unread mails in batches until get_unread_mail() returns None.
    The database connection is closed on the way out, even when a
    request or the parsing fails.
    '''
    db_conn = Db_Connect('192.168.110.142','admin','admin','test')
    try:
        username = raw_input('Enter you email:')
        password = getpass.getpass('Enter you password:')
        login = mail163(username,password)
        sid = login.login() # login the 163 mail for getting sid
        # nothing to fetch unless the login produced a sid
        if sid:
            start = 0   # value of the request's "start" field
            limit = 5   # read 5 unread mails at once
            while True:
                res = login.get_unread_mail(start,limit)
                if res is None:
                    break
                res = login.format(res)  # use re module format data
                Mail_Handler(res,db_conn).parser()  # use expat parse xml
                start += limit
    finally:
        # previously skipped whenever anything above raised
        db_conn.close()
# Script entry point: run main() only when the file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()




这段代码读取163邮箱的未读邮件列表,用expat解析返回的XML数据,并将每封邮件的id、发件人、收件人、标题和大小插入MySQL数据库。Login163类是“爬虫”一节中定义的类。

你可能感兴趣的:(xml,python,expat)