python-爬虫,邮件

1,爬虫程序

crawler.py

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import sys, urllib, hashlib, htmllib, os, formatter, string
 
class Parser(htmllib.HTMLParser):
    def __init__(self, verbose = 0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)
 
    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href
 
    def anchor_end(self):
        text = string.strip(self.save_end())
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]

def getmainurl(url):
    """Return the scheme-and-host prefix of *url*.

    For example ``http://www.baidu.com/123`` yields ``http://www.baidu.com``.
    The search for the first path slash starts after the ``://`` separator,
    so ``https://`` URLs are handled the same way as ``http://`` ones.
    A URL without a path component is returned unchanged.
    """
    scheme_sep = url.find('://')
    # Skip past "scheme://" so its own slashes are not mistaken for the path.
    host_start = scheme_sep + len('://') if scheme_sep >= 0 else 0
    ind = url.find('/', host_start)
    if ind > 0:
        return url[:ind]
    return url
def getURL(url, html, queue):
    """Extract the links in *html* and append the not-yet-saved ones to *queue*.

    url   -- address the page was fetched from; links that are not absolute
             http/https URLs are joined onto its scheme-and-host prefix
             (as returned by getmainurl).
    html  -- page source to parse.
    queue -- crawl frontier; new URLs are appended to it in place.

    Returns None.  A page that cannot be parsed contributes no links.
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        # Malformed markup: skip this page, but let Ctrl-C propagate.
        return
    mainurl = getmainurl(url)
    print("mainurl:" + mainurl)
    for links in p.anchors.values():
        for u in links:
            if not u.startswith(('http://', 'https://')):
                # Insert exactly one '/' between host and path when neither
                # side already supplies it.
                if mainurl.endswith('/') or u.startswith('/'):
                    u = mainurl + u
                else:
                    u = mainurl + '/' + u
            # A page is stored under the MD5 of its URL; an existing file
            # means it was already crawled.  The directory must be the one
            # BFS writes to, otherwise this check can never match.
            filename = hashlib.md5(u).hexdigest() + ".html"
            filepath = "./html/" + filename
            if not os.path.isfile(filepath):
                queue.append(u)
def BFS():
    """Crawl breadth-first from the seed URL, saving each page under ./html.

    Each fetched page is written to ``./html/<md5 of url>.html`` and its
    links are queued through getURL().  A URL whose file already exists is
    skipped, and a URL that cannot be opened is reported and skipped.
    Returns None once the queue is empty.
    """
    from collections import deque

    # deque gives O(1) removal from the front; getURL only needs append().
    queue = deque(["http://www.baidu.com/"])
    while queue:
        url = queue.popleft()
        print("%s:%s" % (len(queue), url))

        # The MD5 of the URL is the file name, so an existing file means the
        # page was already crawled; check before spending a network request.
        filepath = "./html/" + hashlib.md5(url).hexdigest() + ".html"
        if os.path.isfile(filepath):
            continue

        try:
            wp = urllib.urlopen(url)
        except Exception:
            # Nothing was opened, so there is no handle to close here.
            print("%s can not open this url" % url)
            continue
        try:
            content = wp.read()
        finally:
            wp.close()

        # Binary mode keeps the downloaded bytes unchanged on every platform.
        with open(filepath, "wb") as fp:
            fp.write(content)
        getURL(url, content, queue)
 
def main():
    """Create the ./html output directory and run the crawl.

    Exits the process with status 1 if the directory cannot be created.
    """
    out_dir = "./html"
    try:
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
    except OSError:
        print("Failed to create directory in %s" % out_dir)
        sys.exit(1)
    # BFS() returns only when its queue is empty.  By then the seed page is
    # already on disk, so calling it again would return at once and loop
    # forever without crawling anything new.  Run it a single time.
    BFS()
 
if __name__ == '__main__' :
    # Start every run from an empty output directory.  shutil.rmtree replaces
    # the former `rm -rf` shell call, so no external `rm` binary is needed;
    # ignore_errors keeps a missing directory from being an error.
    import shutil
    shutil.rmtree("html", ignore_errors=True)
    main()
       

2,发邮件程序

conf.py

# Sender address: used for the From header and as the SMTP envelope sender.
mail_from = "[email protected]"
# Recipient address(es); separate several with commas.
mail_to = "[email protected]"
# Carbon-copy address(es), comma separated; may be left empty.
mail_cc = ""
# Subject prefix; send_mail appends the current date in square brackets.
mail_title = "test mail"

send_mail.py

#!/bin/env python2.7
# -*- coding: UTF-8 -*-
import conf
import os
import commands
import time
import email
import smtplib
from email.Message import Message
import datetime

def send_mail(report):
    """Mail *report* as HTML using the addresses and title defined in conf.

    The subject is ``conf.mail_title`` followed by today's date in square
    brackets (``YYYY-MM-DD``).  No account or password is passed on.
    """
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    subject = conf.mail_title + "[" + today + "]"
    sender = SMTP_SSL('smtp.ops.xxx-inc.com')
    sender.SendHTML(
        '', '',
        conf.mail_from, conf.mail_to, conf.mail_cc,
        subject, report)

class SMTP_SSL(smtplib.SMTP):
    """Small helper that sends one HTML mail through an SMTP server on port 25.

    Despite the name, SendHTML connects with a plain ``smtplib.SMTP``
    object; no SSL/TLS is negotiated.  The base-class initializer is not
    invoked, so only ``host`` is set on the instance and SendHTML opens its
    own connection for every message.
    """

    def __init__(self, host=''):
        self.host = host

    @staticmethod
    def _split_addresses(*address_lists):
        """Flatten comma-separated address strings into a list, dropping blanks."""
        addresses = []
        for address_list in address_lists:
            for address in (address_list or '').split(','):
                address = address.strip()
                if address:
                    addresses.append(address)
        return addresses

    @staticmethod
    def _build_message(fromList, toList, ccList, subject, content):
        """Return a text/html Message carrying the given headers and body."""
        # Imported here so the method does not depend on the deprecated
        # ``email.Utils`` alias being available.
        from email.utils import formatdate

        msg = Message()
        msg['Mime-Version'] = '1.0'
        # NOTE(review): no charset is declared; non-ASCII content may render
        # incorrectly in some mail clients -- confirm the intended encoding.
        msg['Content-Type'] = 'text/html'
        msg['From'] = fromList
        msg['To'] = toList
        if ccList:
            # Omit the header entirely instead of emitting an empty "CC:".
            msg['CC'] = ccList
        msg['Subject'] = subject
        msg['Date'] = formatdate()
        msg.set_payload(content)
        return msg

    def SendHTML(self,account,passwd,fromList,toList,ccList,subject,content):
        """Send *content* as an HTML mail.

        account, passwd -- accepted for interface compatibility; not used,
                           no login is performed.
        fromList        -- sender address (From header and envelope sender).
        toList, ccList  -- comma-separated recipient strings; blank entries
                           are ignored, so an empty ccList adds no recipient.
        subject         -- Subject header value.
        content         -- HTML body.

        Exceptions raised by smtplib while connecting or sending propagate.
        """
        msg = self._build_message(fromList, toList, ccList, subject, content)
        recipients = self._split_addresses(toList, ccList)
        smtp = smtplib.SMTP(host=self.host, port=25)
        try:
            smtp.sendmail(fromList, recipients, msg.as_string())
        finally:
            # Always end the session, even when sending failed.
            try:
                smtp.quit()
            except smtplib.SMTPServerDisconnected:
                # Connection is already gone; nothing left to clean up.
                pass

def start():
    """Send ten demo mails whose bodies count from 1 to 10."""
    template = "大家high起来,哟哟,切克闹!跟着我来数:%d"
    for count in range(1, 11):
        send_mail(template % count)

# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    start()


你可能感兴趣的:(python)