crawler.py
# -*- coding: UTF-8 -*-
#!/usr/bin/python
import sys, urllib, hashlib, htmllib, os, formatter, string
class Parser(htmllib.HTMLParser):
def __init__(self, verbose = 0):
self.anchors = {}
f = formatter.NullFormatter()
htmllib.HTMLParser.__init__(self, f, verbose)
def anchor_bgn(self, href, name, type):
self.save_bgn()
self.anchor = href
def anchor_end(self):
text = string.strip(self.save_end())
if self.anchor and text:
self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
# Return the root of a URL, e.g. http://www.baidu.com/123 -> http://www.baidu.com
def getmainurl(url):
    """Return ``url`` cut down to its scheme and host part.

    Everything from the first ``/`` after the host is dropped. A URL without
    a path (or without any ``/``) is returned unchanged.
    """
    # Skip "scheme://" when the URL starts with one, so the slashes of the
    # separator are not mistaken for the start of the path. Checking that
    # the first slash belongs to "://" keeps a "://" inside a query string
    # from being treated as the scheme.
    sep = url.find('://')
    if sep != -1 and url.find('/') == sep + 1:
        start = sep + len('://')
    else:
        start = 0
    ind = url.find('/', start)
    if ind > 0:
        return url[:ind]
    return url
# Extract all links from a fetched page and queue the ones not handled yet.
def getURL(url, html, queue):
    """Append the links found in ``html`` to ``queue``.

    url   -- address the page was fetched from; its root is used to turn
             relative links into absolute ones.
    html  -- page source to parse.
    queue -- list of pending URLs; new URLs are appended in place.

    A link is skipped when it is already pending in ``queue`` or when a file
    named after the MD5 hash of the link already exists under ``./html/``
    (the directory BFS writes fetched pages to). Returns None.
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        # Best effort: a page the parser cannot handle contributes no links.
        return
    mainurl = getmainurl(url)
    print("mainurl:" + mainurl)
    # Set copy of the queue so the "already pending" test is O(1) per link.
    pending = set(queue)
    for links in p.anchors.values():
        for u in links:
            if not u.startswith(('http://', 'https://')):
                # Relative link: resolve it against the site root.
                if mainurl[-1] != '/' and u[0] != '/':
                    u = mainurl + '/' + u
                else:
                    u = mainurl + u
            if u in pending:
                continue
            # Same naming scheme and directory as BFS uses when saving.
            filepath = "./html/" + hashlib.md5(u).hexdigest() + ".html"
            if not os.path.isfile(filepath):
                queue.append(u)
                pending.add(u)
def BFS(start_url="http://www.baidu.com/"):
    """Crawl breadth-first from ``start_url`` until the queue is empty.

    Each fetched page is written to ``./html/<md5 of url>.html`` and its
    links are queued through getURL. URLs whose file already exists are
    skipped without being downloaded again; URLs that cannot be fetched are
    reported and skipped.
    """
    queue = [start_url]
    while queue:
        url = queue.pop(0)
        print("%s:" % len(queue) + url)
        # The MD5 hash of the URL is used as the file name.
        filepath = "./html/" + hashlib.md5(url).hexdigest() + ".html"
        if os.path.isfile(filepath):
            # Already saved: no need to download it a second time.
            continue
        try:
            wp = urllib.urlopen(url)
            try:
                content = wp.read()
            finally:
                # Close only a response that was actually opened.
                wp.close()
        except Exception:
            # Crawl-loop boundary: one bad URL must not stop the crawl.
            print("%s can not open this url" % url)
            continue
        # Binary mode: the body is written exactly as received.
        with open(filepath, "wb") as fp:
            fp.write(content)
        getURL(url, content, queue)
def main():
    """Create the ``./html`` output directory and run one crawl.

    Exits with status 1 when the directory cannot be created.
    """
    # Create the storage directory.
    out_dir = "./html"
    try:
        if not os.path.isdir(out_dir):
            os.mkdir(out_dir)
    except OSError:
        print("Failed to create directory in %s" % out_dir)
        sys.exit(1)
    # Breadth-first traversal. It runs once: restarting it after the queue
    # has drained would only hit the already-saved seed URL over and over.
    BFS()
if __name__ == '__main__':
    # Start every run from an empty output directory. shutil.rmtree is used
    # instead of shelling out to "rm -rf" so this also works where no POSIX
    # shell is available; a missing directory is not an error.
    import shutil
    shutil.rmtree("html", ignore_errors=True)
    main()
2、发邮件程序
conf.py
# Mail settings read by send_mail.py.
# NOTE(review): the two addresses look like redacted placeholders -- confirm
# the real values before use.
mail_from = "[email protected]"  # sender address
mail_to = "[email protected]"  # recipients; send_mail.py splits this on ','
mail_cc = ""  # CC recipients, same comma-separated format
mail_title = "test mail"  # subject prefix; send_mail.py appends "[date]"
send_mail.py
#!/bin/env python2.7
# -*- coding: UTF-8 -*-
import conf
import os
import commands
import time
import email
import smtplib
from email.Message import Message
import datetime
def send_mail(report):
    """Mail ``report`` as an HTML message using the settings in ``conf``.

    The subject is ``conf.mail_title`` followed by today's date
    (YYYY-MM-DD) in square brackets. Sender, recipients and CC come from
    ``conf.mail_from``, ``conf.mail_to`` and ``conf.mail_cc``.
    """
    sender = conf.mail_from
    recipients = conf.mail_to
    cc_recipients = conf.mail_cc
    title_prefix = conf.mail_title
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    subject = title_prefix + "[" + today + "]"
    mailer = SMTP_SSL('smtp.ops.xxx-inc.com')
    # Account and password are passed as empty strings.
    mailer.SendHTML('', '', sender, recipients, cc_recipients, subject, report)
class SMTP_SSL(smtplib.SMTP):
    """Small HTML mail sender bound to one SMTP host.

    Despite the name, mail goes out over a plain ``smtplib.SMTP`` connection
    on port 25. The base-class initialiser is not called: the instance only
    stores the host name, and every SendHTML call opens its own connection.
    """

    def __init__(self, host=''):
        self.host = host

    @staticmethod
    def _recipients(*address_lists):
        """Flatten comma-separated address strings into one list, dropping blanks."""
        result = []
        for addresses in address_lists:
            for address in addresses.split(','):
                address = address.strip()
                if address:
                    result.append(address)
        return result

    def SendHTML(self, account, passwd, fromList, toList, ccList, subject, content):
        """Send ``content`` as a ``text/html`` message.

        account, passwd -- accepted for interface compatibility; not used,
                           no login is performed.
        fromList        -- sender address.
        toList, ccList  -- comma-separated recipient strings; either may be
                           empty.
        subject         -- subject line.
        content         -- message body.

        Exceptions raised by smtplib propagate to the caller; the connection
        is closed in every case.
        """
        # Lower-case module name: "email.Utils" is a deprecated alias.
        from email.utils import formatdate

        msg = Message()
        msg['Mime-Version'] = '1.0'
        # NOTE(review): no charset is declared; non-ASCII bodies may be shown
        # wrongly by some clients -- confirm the encoding callers use.
        msg['Content-Type'] = 'text/html'
        msg['From'] = fromList
        msg['To'] = toList
        if ccList:
            # An empty CC header is not a valid address list, so omit it.
            msg['CC'] = ccList
        msg['Subject'] = subject
        msg['Date'] = formatdate()
        msg.set_payload(content)
        smtp = smtplib.SMTP(host=self.host, port=25)
        try:
            # ''.split(',') yields [''], so blanks are filtered out rather
            # than being sent to the server as an empty recipient.
            smtp.sendmail(fromList, self._recipients(toList, ccList), msg.as_string())
            smtp.quit()
        finally:
            # close() is safe after quit() and also covers the error path.
            smtp.close()
def start():
    """Send ten mails through send_mail, numbered 1 to 10 in the body."""
    for count in range(1, 11):
        send_mail("大家high起来,哟哟,切克闹!跟着我来数:%d" % count)
# Run the mail demo only when this file is executed as a script.
if __name__ == "__main__":
    start()