Last time we used requests and XPath to crawl the course list on Jikexueyuan (极客学院), but that still wasn't quite satisfying. Today, let's crawl the topics in the 妈妈 (Mama) forum on Baidu Tieba and see what the moms there like to talk about!
Before we start crawling, here are our targets:
1. Scrape the topic list of the Mama forum on Baidu Tieba
2. For every topic, grab the poster, the post time, the title, the content, and the number of replies

First, fetch the page with requests and print the raw HTML, just to confirm we can actually get it:
import requests

# First list page of the forum; pn is the paging offset (0 for page 1)
url = 'http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=0'
html = requests.get(url)
print html.text
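A side note, not part of the original code: if the bare request ever comes back empty or blocked, sending a browser User-Agent usually helps. A minimal sketch; the header string here is just an example browser UA, not something Tieba specifically requires:

import requests

url = 'http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=0'
# Example browser User-Agent string; any mainstream one should do
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36'}
html = requests.get(url, headers=headers)
print html.status_code   # 200 means the page came back fine

The HTML prints fine, so the next step is to load it into lxml and pull out each thread's fields with XPath. Inspecting the page shows that every thread in the list is an <li class=" j_thread_list clearfix"> node, and the fields we want live in nested divs under it: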
import requests
from lxml import etree

url = 'http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=0'
html = requests.get(url)
selector = etree.HTML(html.text)
# Every thread on the list page is one of these <li> nodes
content_field = selector.xpath('//li[@class=" j_thread_list clearfix"]')
for each in content_field:
    reply_num = each.xpath('div/div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
    list_title = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/text()')[0]
    author = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/a/text()')[0]
    create_time = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()')[0]
    content = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()')[0]
    print reply_num
    print list_title
    print author
    print create_time
    print content
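One thing to watch: the class attributes in Tieba's markup contain leading or trailing spaces (" j_thread_list clearfix", "tb_icon_author " and so on), and XPath's @class= test is an exact string comparison, so those spaces must be copied verbatim. A looser alternative is contains(); a minimal sketch, assuming no other <li> on the page carries j_thread_list in its class:

# Matches on a substring of the class attribute instead of the exact string
content_field = selector.xpath('//li[contains(@class, "j_thread_list")]')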
Run it, though, and partway down the list we hit an error:

list_title = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/text()')[0]
IndexError: list index out of range
So some threads have a field the XPath can't match: xpath() returns an empty list for them, and indexing [0] into an empty list raises IndexError. We'll simply skip those threads with try...except: if a thread raises an exception, move on to the next one.
import requests
from lxml import etree

url = 'http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=0'
html = requests.get(url)
selector = etree.HTML(html.text)
content_field = selector.xpath('//li[@class=" j_thread_list clearfix"]')
for each in content_field:
    try:
        reply_num = each.xpath('div/div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
        list_title = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/text()')[0]
        author = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/a/text()')[0]
        create_time = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()')[0]
        content = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()')[0]
        print reply_num
        print list_title
        print author
        print create_time
        print content
    except Exception, e:
        # A thread missing one of the fields: skip it and keep going
        continue
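If you'd rather not swallow every exception, an alternative is to check what xpath() returned before indexing into it. A minimal sketch with a hypothetical helper, not part of the original script:

def first_or_blank(nodes):
    # xpath() returns a list; take the first item, or u'' when nothing matched
    return nodes[0] if nodes else u''

# Inside the loop, instead of each.xpath(...)[0]:
# reply_num = first_or_blank(each.xpath('div/div[@class="col2_left j_threadlist_li_left"]/span/text()'))

With the extraction working, let's put everything together into a full script: collect the fields into a dict, write each topic to content.txt, and loop over the first 10 pages (pn advances by 50 per page):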
# _*_ coding:utf-8 _*_
from lxml import etree
import requests
import sys
reload(sys)
# Python 2 hack: make implicit str/unicode conversions use UTF-8
sys.setdefaultencoding('utf-8')

# Write one topic's fields to the output file
def towrite(contentdict):
    f.writelines(u'Replies: ' + unicode(contentdict['reply_num']) + '\n')
    f.writelines(u'Title: ' + unicode(contentdict['topic_title']) + '\n')
    f.writelines(u'Content: ' + unicode(contentdict['topic_content']) + '\n')
    f.writelines(u'Author: ' + unicode(contentdict['user_name']) + '\n')
    f.writelines(u'Post time: ' + str(contentdict['topic_time']) + '\n\n')

# Crawler body: fetch one list page and extract every topic on it
def spider(url):
    html = requests.get(url)
    selector = etree.HTML(html.text)
    content_field = selector.xpath('//li[@class=" j_thread_list clearfix"]')
    item = {}
    for each in content_field:
        try:
            item['reply_num'] = each.xpath('div/div[@class="col2_left j_threadlist_li_left"]/span/text()')[0]
            item['topic_title'] = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_title pull_left j_th_tit "]/a/text()')[0]
            item['user_name'] = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="tb_icon_author "]/a/text()')[0]
            item['topic_time'] = each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_lz clearfix"]/div[@class="threadlist_author pull_right"]/span[@class="pull-right is_show_create_time"]/text()')[0]
            # split() + join() collapses the whitespace around the preview text
            content = (each.xpath('div/div[@class="col2_right j_threadlist_li_right "]/div[@class="threadlist_detail clearfix"]/div[@class="threadlist_text pull_left"]/div/text()')[0]).split()
            item['topic_content'] = ''.join(content)
            towrite(item)
        except Exception, e:
            continue

if __name__ == '__main__':
    f = open('content.txt', 'a')
    # Generate the link for each page: pn advances by 50 per page
    for x in range(10):
        i = x * 50
        newpage = 'http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=' + str(i)
        print u"Page %d" % (x + 1)
        print newpage
        spider(newpage)
    f.close()
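Two optional refinements, neither of which is in the original script. First, the reload(sys)/setdefaultencoding trick is a Python 2 hack; opening the file with an explicit encoding via the standard codecs module does the same job more cleanly:

import codecs

# Unicode strings can then be written directly; no default-encoding hack needed
f = codecs.open('content.txt', 'a', encoding='utf-8')

Second, to be a little gentler on Tieba's servers when looping over pages, you can pause between requests:

import time

for x in range(10):
    spider('http://tieba.baidu.com/f?kw=%E5%A6%88%E5%A6%88&ie=utf-8&pn=' + str(x * 50))
    time.sleep(1)   # one-second pause between pages; adjust to taste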