crawler.py
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Breadth-first web crawler.

Starting from SEED_URL, every page is downloaded once, stored unchanged as
``./html/<md5 of the URL>.html`` and scanned for further http(s) links, which
are appended to the crawl queue.

Written to run on both Python 2.7 and Python 3: the ``htmllib`` and
``formatter`` modules used by earlier versions of this script no longer
exist in current Python, so link extraction is built on ``HTMLParser``.
"""
import hashlib
import os
import shutil
import sys
from collections import deque
from contextlib import closing

try:  # Python 3
    from html.parser import HTMLParser
    from urllib.parse import quote, urldefrag, urljoin
    from urllib.request import urlopen
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    from urllib import quote, urlopen
    from urlparse import urldefrag, urljoin

# First URL placed on the crawl queue.
SEED_URL = "http://www.baidu.com/"
# Directory that receives one file per downloaded page.
HTML_DIR = "./html"
# Characters left untouched when percent-encoding a URL ('%' is included so
# that already-encoded sequences are not encoded twice).
_URL_SAFE_CHARS = "%/:=&?~#+!$,;'@()*[]"


class Parser(HTMLParser):
    """Collect the hyperlinks of an HTML document.

    After ``feed()`` and ``close()``, ``anchors`` maps the whitespace-
    normalised text of each ``<a>`` element to the list of ``href`` values
    seen with that text.  Anchors without an href or without any text are
    ignored.
    """

    def __init__(self, verbose=0):
        # ``verbose`` is accepted for backward compatibility only; it is not
        # used by this implementation.
        HTMLParser.__init__(self)
        self.anchors = {}
        self.anchor = None
        # Text fragments of the <a> element currently open, or None when the
        # parser is not inside an anchor.
        self._label_parts = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            attributes = dict(attrs)
            self.anchor_bgn(attributes.get('href'),
                            attributes.get('name'),
                            attributes.get('type'))

    def handle_endtag(self, tag):
        if tag == 'a' and self._label_parts is not None:
            self.anchor_end()

    def handle_data(self, data):
        if self._label_parts is not None:
            self._label_parts.append(data)

    def handle_entityref(self, name):
        # Only invoked when the base parser does not convert references
        # itself; keep the raw reference so the anchor still counts as
        # labelled.
        self.handle_data("&%s;" % name)

    def handle_charref(self, name):
        # See handle_entityref.
        self.handle_data("&#%s;" % name)

    def anchor_bgn(self, href, name, type):
        """Start recording the text of an anchor that points at *href*.

        *name* and *type* are part of the established hook signature and are
        not used.
        """
        self._label_parts = []
        self.anchor = href

    def anchor_end(self):
        """Finish the current anchor and file it under its visible text."""
        text = ' '.join(''.join(self._label_parts or []).split())
        self._label_parts = None
        if self.anchor and text:
            self.anchors.setdefault(text, []).append(self.anchor)
        self.anchor = None


def getmainurl(url):
    """Return the scheme-and-host part of *url*.

    For example ``http://www.baidu.com/123`` gives ``http://www.baidu.com``.
    The search for the first path slash starts after the ``scheme://``
    prefix whatever the scheme is, so ``https://`` URLs are handled as well.
    If *url* has no path it is returned unchanged.
    """
    scheme_end = url.find('://')
    if scheme_end > 0 and '/' not in url[:scheme_end]:
        host_start = scheme_end + len('://')
    else:
        host_start = 0
    ind = url.find('/', host_start)
    if ind > 0:
        return url[:ind]
    return url


def _cache_path(url):
    """Return the file under HTML_DIR in which the page at *url* is stored."""
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    # The MD5 hex digest of the URL serves as a fixed-length file name.
    return os.path.join(HTML_DIR, hashlib.md5(url).hexdigest() + ".html")


def _normalize_url(url):
    """Drop the fragment of *url* and percent-encode unsafe characters."""
    url = urldefrag(url)[0]
    if not isinstance(url, bytes):
        url = url.encode('utf-8')
    return quote(url, safe=_URL_SAFE_CHARS)


def getURL(url, html, queue, seen=None):
    """Append the not-yet-downloaded links found in *html* to *queue*.

    url   -- address the page was fetched from; relative links are resolved
             against it.
    html  -- page content, as text or as raw bytes (decoded as UTF-8 with
             replacement characters; assumes hrefs survive that decoding).
    queue -- any object with ``append``; receives the new absolute URLs.
    seen  -- optional set of URLs that were already queued.  It is updated
             in place so that a URL is queued at most once per crawl.

    Only ``http://`` and ``https://`` targets are queued; other schemes such
    as ``mailto:`` or ``javascript:`` are skipped.  If the page cannot be
    parsed, nothing is queued.
    """
    if seen is None:
        seen = set()
    if isinstance(html, bytes):
        html = html.decode('utf-8', 'replace')

    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception as exc:
        # Best effort: a page that cannot be parsed contributes no links.
        sys.stderr.write("can not parse %s: %s\n" % (url, exc))
        return

    mainurl = getmainurl(url)
    print("mainurl:" + mainurl)

    for hrefs in p.anchors.values():
        for href in hrefs:
            try:
                target = _normalize_url(urljoin(url, href.strip()))
            except ValueError:
                # Malformed href (for example a broken IPv6 literal).
                continue
            if not target.startswith(('http://', 'https://')):
                continue
            if target in seen:
                continue
            seen.add(target)
            # Pages already stored by this crawl are not fetched again.
            if not os.path.isfile(_cache_path(target)):
                queue.append(target)


def BFS():
    """Crawl breadth-first from SEED_URL until the queue is empty.

    Each page is stored in HTML_DIR (which must already exist) before its
    links are queued.  URLs that cannot be fetched are reported and skipped.
    """
    queue = deque([SEED_URL])
    seen = set(queue)
    while queue:
        url = queue.popleft()
        print("%s:" % len(queue) + url)

        # Check the cache before touching the network.
        filepath = _cache_path(url)
        if os.path.isfile(filepath):
            continue

        try:
            with closing(urlopen(url)) as wp:
                content = wp.read()
        except Exception:
            # urlopen raises unrelated exception types on Python 2 and 3;
            # any failure simply means this URL is skipped.
            print("%s can not open this url" % url)
            continue

        # Binary mode: the page is stored exactly as it was received.
        with open(filepath, "wb") as fp:
            fp.write(content)

        getURL(url, content, queue, seen)


def main():
    """Create the storage directory and run one breadth-first crawl."""
    try:
        if not os.path.exists(HTML_DIR):
            os.mkdir(HTML_DIR)
    except OSError:
        print("Failed to create directory in %s" % HTML_DIR)
        sys.exit(1)

    BFS()


if __name__ == '__main__':
    # Every run starts from an empty page store.
    shutil.rmtree(HTML_DIR, ignore_errors=True)
    main()
2,发邮件程序
conf.py
# Mail settings read by send_mail.py.

# Address used as the sender of the message.
mail_from = "[email protected]"
# Comma-separated list of primary recipients.
mail_to = "[email protected]"
# Comma-separated list of CC recipients (may be empty).
mail_cc = ""
# Subject prefix; send_mail.py appends the current date in brackets.
mail_title = "test mail"
send_mail.py
#!/bin/env python2.7
# -*- coding: UTF-8 -*-
"""Send an HTML report by e-mail using the addresses configured in conf.py."""
import commands
import datetime
import email.utils
import os
import smtplib
import time
from email.message import Message

import conf

# Mail relay used for every message.
SMTP_HOST = 'smtp.ops.xxx-inc.com'
SMTP_PORT = 25


def send_mail(report):
    """Mail *report* (an HTML string) to the recipients listed in conf.

    The subject is ``conf.mail_title`` followed by today's date in brackets.
    """
    fromList = conf.mail_from
    toList = conf.mail_to
    ccList = conf.mail_cc
    mail_title = "%s[%s]" % (
        conf.mail_title, datetime.datetime.now().strftime('%Y-%m-%d'))

    emailSender = SMTP_SSL(SMTP_HOST)
    emailSender.SendHTML('', '', fromList, toList, ccList, mail_title, report)


def _split_addresses(*fields):
    """Return the non-empty addresses of several comma-separated strings."""
    addresses = []
    for field in fields:
        for address in field.split(','):
            address = address.strip()
            if address:
                addresses.append(address)
    return addresses


class SMTP_SSL(smtplib.SMTP):
    """Small helper that mails an HTML document through ``host``.

    NOTE(review): despite its name the class talks plain SMTP on port 25,
    and although it derives from ``smtplib.SMTP`` the base class is never
    initialised -- every send opens its own ``smtplib.SMTP`` connection.
    The base class is kept only so existing ``isinstance`` checks keep
    working; inherited methods must not be called on these objects.
    """

    def __init__(self, host=''):
        self.host = host

    def SendHTML(self, account, passwd, fromList, toList, ccList, subject,
                 content):
        """Send *content* as a UTF-8 ``text/html`` message.

        account, passwd -- accepted for interface compatibility; no login is
                           performed.
        fromList        -- sender address.
        toList, ccList  -- comma-separated recipient lists; empty entries
                           are ignored.
        subject         -- subject line.
        content         -- HTML body.

        Errors raised by ``smtplib`` are propagated to the caller; the
        connection is closed in every case.
        """
        msg = Message()
        msg['Mime-Version'] = '1.0'
        msg['Content-Type'] = 'text/html'
        msg['From'] = fromList
        msg['To'] = toList
        if ccList:
            msg['CC'] = ccList
        msg['Subject'] = subject
        msg['Date'] = email.utils.formatdate()
        # Passing the charset adds it to the Content-Type header and selects
        # a transfer encoding that is safe for non-ASCII bodies.
        msg.set_payload(content, 'utf-8')

        # An empty CC setting must not turn into an empty recipient.
        recipients = _split_addresses(toList, ccList)

        smtp = smtplib.SMTP(host=self.host, port=SMTP_PORT)
        try:
            smtp.sendmail(fromList, recipients, msg.as_string())
            smtp.quit()
        finally:
            # close() is safe after quit() and releases the socket when
            # sending failed.
            smtp.close()


def start():
    """Send ten numbered test mails."""
    for i in range(1, 11):
        context = "大家high起来,哟哟,切克闹!跟着我来数:%d" % i
        send_mail(context)


if __name__ == "__main__":
    start()