百度贴吧爬虫

#!/usr/bin/env python
#coding=utf-8
import httplib2
import json
from lxml import etree
def replace(s):
	"""Turn a site-relative thread path into an absolute Tieba URL.

	Args:
		s: an href value such as ``/p/2259628273``.

	Returns:
		``s`` prefixed with ``http://tieba.baidu.com`` when it starts with
		``/p/``; otherwise ``s`` unchanged.
	"""
	# Only a leading '/p/' marks a relative thread path. Rewriting every
	# occurrence would corrupt hrefs that are already absolute or that
	# happen to contain '/p/' further along.
	if s.startswith('/p/'):
		return 'http://tieba.baidu.com' + s
	return s

def openhttp(url):
	"""Fetch ``url`` with an HTTP GET and return the response body.

	The request goes through an ``httplib2.Http`` client constructed with
	``'.cache'`` as its cache argument. The first element of the returned
	pair (the response object) is discarded; only the body is returned.
	"""
	client = httplib2.Http('.cache')
	_response, body = client.request(url, 'GET')
	return body

def store_file(reply_sum, out=None):
	"""Write one thread to the output file as tab-separated text.

	The topic is written on its own line, followed by one line per reply in
	the form ``floor<TAB>id<TAB>name<TAB>content<TAB>time``.

	Args:
		reply_sum: dict with a 'topic' string and an 'every_floor' list of
			reply dicts carrying 'floor', 'id', 'name', 'content' and 'time'.
			A missing 'every_floor' is treated as "no replies".
		out: writable file-like object. Defaults to the module-level
			``filehandle`` so existing one-argument callers keep working.

	Raises:
		KeyError: if ``reply_sum`` has no 'topic'.
	"""
	if out is None:
		out = filehandle
	out.write(reply_sum['topic'])
	out.write('\n')
	for entry in reply_sum.get('every_floor', ()):
		# Collect every field before writing anything, so an incomplete
		# reply is skipped as a whole instead of leaving a truncated line
		# behind and aborting all the replies that follow it.
		try:
			row = (
				str(entry['floor']), '\t',
				str(entry['id']), '\t',
				entry['name'], '\t',
				entry['content'], '\t',
				entry['time'], '\n',
			)
		except KeyError as missing:
			print('skipping incomplete reply, missing field %s' % missing)
			continue
		# writelines() emits the pieces one by one, i.e. without first
		# concatenating them into a single string.
		out.writelines(row)

def _parse_reply(reply, floor):
	"""Build the record for one reply element, or return None if unusable.

	Args:
		reply: lxml element for one post; its ``data-field`` attribute is
			parsed as JSON and read at ["author"]["id"], ["author"]["name"]
			and ["content"]["date"].
		floor: 1-based floor number to store in the record.

	Returns:
		dict with 'floor', 'id', 'name', 'content' and 'time' (the three text
		fields UTF-8 encoded), or None when the content div or any of the
		expected metadata is missing or malformed.
	"""
	contents = reply.xpath(u'descendant::div[@class="d_post_content j_d_post_content"]')
	if not contents:
		# No content div on this reply: report it as unusable instead of
		# reusing text that belongs to a previously processed reply.
		return None
	try:
		author_data = json.loads(reply.attrib['data-field'])
		author = author_data["author"]
		return {
			'floor': floor,
			'id': author["id"],
			# When several content divs match, the last one is used.
			# ``.text`` is None if the div starts with a child element;
			# store an empty string in that case.
			'content': (contents[-1].text or u'').encode('utf8'),
			'time': author_data["content"]["date"].encode('utf8'),
			'name': author["name"].encode('utf8'),
		}
	except (KeyError, ValueError, TypeError, AttributeError):
		# Missing attribute/key, invalid JSON, or a field of unexpected type.
		return None


def parse_link(topic,link):
	"""Scrape every page of one thread and hand the replies to store_file().

	The thread page at ``link`` is fetched once to read the page count from
	the ``l_reply_num`` element, then each page ``link?pn=N`` (N from 1 to
	the page count) is fetched, decoded as GBK and scanned for reply
	elements. Progress is printed to stdout along the way.

	Args:
		topic: thread title; stored and printed UTF-8 encoded.
		link: absolute URL of the thread, without a ``pn`` query parameter.

	Raises:
		IndexError: if the page has no ``l_reply_num`` span to read the page
			count from.
	"""
	first_page = etree.HTML(openhttp(link).decode('gbk'))
	total_page = int(first_page.xpath(u'//*[@class="l_reply_num"]/span')[0].text)
	print("共有页码数:%d" % (total_page,))
	print('start==========================')
	print("主题是:%s" % (topic.encode('utf8'),))

	reply_list = []
	floor = 0
	for n in range(1, total_page + 1):
		page_url = link + '?pn=' + str(n)
		print('准备检索的url')
		print(page_url)
		now_page = etree.HTML(openhttp(page_url).decode('gbk'))
		replies = now_page.xpath(u'//*[@class="l_post "]|//*[@class="l_post noborder"]')
		for reply in replies:
			print("层数:%d" % (floor,))
			record = _parse_reply(reply, floor + 1)
			if record is None:
				print('skipped reply %d: missing or malformed data' % (floor + 1,))
			else:
				reply_list.append(record)
			# The floor counter advances for skipped replies too, so stored
			# floor numbers keep matching the reply's position in the thread.
			floor += 1

	# Dump every collected field to stdout (debug output).
	for record in reply_list:
		for key in record:
			print(record[key])

	store_file({'topic': topic.encode('utf8'), 'every_floor': reply_list})


# Example thread page URL: http://tieba.baidu.com/p/2259628273?pn=2 (reply body div class: "d_post_content j_d_post_content")
def main():
	"""Fetch the forum's thread listing and scrape every thread found on it.

	The listing page is decoded as GBK, every ``<a target="_blank"
	class="j_th_tit">`` anchor is treated as a thread link, and each one is
	passed to parse_link(). URLs and topics are printed as progress output.
	"""
	# The bounds make this loop run exactly once (pn = 0).
	for pn in range(0, 50, 50):
		# NOTE(review): the kw value is presumably the GBK percent-encoded
		# forum name - confirm before changing it.
		url = 'http://tieba.baidu.com/f?kw=%B0%CD%C0%E5%B5%BA&tp=' + str(pn)
		print(url)
		page = etree.HTML(openhttp(url).decode('gbk'))
		anchors = page.xpath(u'//a[@target="_blank"][@class="j_th_tit"]')
		if not anchors:
			# Nothing matched (empty listing or changed markup): report it
			# instead of failing with IndexError on anchors[0] below.
			print('no thread links found on %s' % (url,))
			continue
		print(anchors[0].values())
		for anchor in anchors:
			# NOTE(review): assumes the anchor's first attribute value is
			# the href and the second is the title - TODO confirm against
			# the live markup.
			attrs = anchor.values()
			link = replace(attrs[0])
			topic = attrs[1]
			parse_link(topic, link)
			print(topic)

if __name__=='__main__':
	# store_file() writes through the module-level name ``filehandle``.
	# Binding it with ``with`` keeps that name available during main() and
	# guarantees the file is closed even if the crawl raises part-way.
	with open('aaaaaaaaa.txt','w') as filehandle:
		main()






你可能感兴趣的:(百度贴吧爬虫)