[python] 爬取乌云知识库的标题

代码如下（基于 Python 2，使用 urllib2 和正则表达式）：

import urllib2,httplib
import re

def getinfo(url):
	"""Fetch one listing page and append its article titles to result.txt.

	The page at ``url`` is downloaded with a fixed set of browser-like
	request headers. Every ``title="Permanent Link to ...">`` attribute in
	the returned HTML is treated as one article title, and the titles are
	appended to ``result.txt`` (in the current working directory), one per
	line, in the order they appear in the page. A progress line is printed
	to stdout once the page has been processed.

	Args:
		url: Address of the listing page to download.

	Raises:
		urllib2.URLError: Propagated from ``urllib2.urlopen`` when the
			request fails (``urllib2.HTTPError`` is a subclass).
		IOError: Propagated if ``result.txt`` cannot be opened or written.
	"""
	header = {
	'Host': 'drops.wooyun.org',
	'Connection': 'keep-alive',
	'Cache-Control': 'max-age=0',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
	'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.111 Safari/537.36',
	'Referer': 'http://drops.wooyun.org/category/tips',
	'Accept-Encoding': 'deflate',
	'Accept-Language': 'zh-CN,zh;q=0.8',
	# NOTE(review): this cookie appears to embed session/login tokens
	# (wy_uid, wy_pwd, PHPSESSID, wordpress_logged_in_*). Consider loading
	# it from configuration instead of keeping it in source.
	'Cookie': '__cfduid=d53915c99accdcad55597f1505f45d41d1437057232; wy_uid=05413VWUQtWG87uQ9rQqjQ1%2FBsoMcwzmmY6NUcmZ9GYd; wy_pwd=548chtxV21nx%2FTqMxuearSIbX%2BGf7OXFZbP8OSfT4ZXwbLGD5%2FQ5%2BYyOO%2FrVvGRxHqLDnS7rHOdfololbg; Hm_lvt_c12f88b5c1cd041a732dea597a5ec94c=1441114384,1441198545,1441638408,1441720289; PHPSESSID=2vsqsq4ta4phl8c8us6ekp1tg3; Hm_lvt_9fc41da6a2322bdd80563c9d5a4bdb1d=1441720306,1441801056,1441804034,1442054546; Hm_lpvt_9fc41da6a2322bdd80563c9d5a4bdb1d=1442055432; wordpress_logged_in_7065d11a793a3ec8482214fcc4f0a55b=%E7%8E%8B%E9%B9%8F%40HIT%7C1442228058%7C53c2ec073a1c556b783f94fcc013d4ec',
	# NOTE(review): a fixed conditional-request date may make the server
	# answer 304 Not Modified, which urlopen would surface as an HTTPError
	# -- confirm this header is really wanted.
	'If-Modified-Since': 'Sat, 12 Sep 2015 10:54:18 GMT'
	}
	req = urllib2.Request(url,None,header)
	response = urllib2.urlopen(req)
	try:
		htmlpage = response.read()
	finally:
		# Release the connection even if reading the body fails.
		response.close()

	# The capture group yields just the title text, so no slicing by magic
	# offsets is needed to strip the attribute prefix and the closing '">'.
	title_pattern = re.compile(r'title="Permanent Link to (.*?)">',re.DOTALL)
	titles = title_pattern.findall(htmlpage)

	# Open the output file once per page rather than once per title, and only
	# when there is something to write so an empty page creates no file.
	if titles:
		with open('result.txt','a') as fobj:
			for title in titles:
				fobj.write(title + '\n')
	print("Finish getting "+url)

if __name__=="__main__":
	# Ask how many listing pages to crawl; non-numeric input raises ValueError.
	last_page = int(raw_input("Input the PageNum:"))
	# Listing pages are numbered from 1 up to and including last_page.
	current_page = 1
	while current_page <= last_page:
		getinfo("http://drops.wooyun.org/category/tips/page/%d" % current_page)
		current_page += 1


先写这么多

你可能感兴趣的:(python)