直接百度乔丹的URL是这样的:
root-url:
http://baike.baidu.com/link?url=Htp0lISd4ld46oenEs4HvYh13u8iTht9YxBpUY8e3QECSsxXYQRg_yr7R_wvRos3kWflwekkcn_5VuZsgEhFPY_oQAKj8xla3F6MXbCoDnirERpJEyVuHZ3uGGvpmfwVz_rsdygPH5wirnODhhSv2_
嗯,的确非常恶心。我们需要寻找一个root-url,但是这样恶心的URL确实不能正常工作。
然后我用了一个笨办法,直接百度科比,然后找到乔丹的链接,查看元素并作为HTML编辑,OK,本尊出现:
http://baike.baidu.com/view/19096.htm
有了它后,就可以进行分析和编码了。
从之前的学习记录看,我们至少需要调度器spider_man,URL管理器url_manager,HTML解析器html_parser,HTML下载器html_downloader,以及HTML输出器html_outputer
计划输出结果保存成一个文件,所以,我们的文件结构是:
查看网页上其他链接的格式,右击——查看元素——作为HTML编辑:
<a target="_blank" href="/view/32594.htm">芝加哥公牛队</a>
学习了BeautifulSoup官网上相关的筛选语法后,为了精确的定位我们想要的信息,可以这样:
new_urls=set()
#<a target="_blank" href="/view/32594.htm">芝加哥公牛队</a>
links=soup.find_all('a',href=re.compile(r"/view/\d+\.htm")) #正则匹配
links=soup.find_all(target="_blank")
编码部分:
html_downloader.py:
#coding:utf8
import urllib2
class HtmlDownloader(object):
    """Fetches raw HTML for a URL via urllib2 (Python 2)."""

    def download(self, url):
        """Return the page body for *url*, or None on bad input / non-200.

        The response object is closed in all cases so the underlying
        socket is not leaked (the original never closed it).
        """
        if url is None:
            return None
        response = urllib2.urlopen(url)
        try:
            # Anything other than HTTP 200 is treated as a failed fetch.
            if response.getcode() != 200:
                return None
            return response.read()
        finally:
            response.close()
outputer.py:
#coding:utf8
class HtmlOutputer(object):
    """Collects crawled records and renders them into output.html."""

    def __init__(self):
        # Each entry is a dict with 'url' and 'title' keys.
        self.datas = []

    def collect_data(self, data):
        """Store one parsed record; silently ignore None."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        """Write every collected record into output.html as a table row.

        The file handle is closed via try/finally so a failing write
        cannot leak it; a charset meta tag is emitted so browsers
        render the UTF-8 encoded Chinese titles correctly.
        """
        fout = open('output.html', 'w')
        try:
            fout.write('<html>')
            # Without this declaration the UTF-8 titles show up garbled.
            fout.write('<head><meta charset="utf-8"></head>')
            fout.write('<body>')
            fout.write('<table border="1px solid #000">')
            for data in self.datas:
                fout.write('<tr>')
                fout.write('<td>%s</td>' % data['url'])
                # Titles are unicode; encode explicitly for the Py2 file API.
                fout.write('<td>%s</td>' % data['title'].encode('utf-8'))
                fout.write('</tr>')
            fout.write('</table>')
            fout.write('</body>')
            fout.write('</html>')
        finally:
            fout.close()
html_parser.py:
#coding:utf8
from bs4 import BeautifulSoup
import re
import urlparse
class HtmlParser(object):
    """Extracts follow-up links and the lemma title from a Baike page."""

    def _get_new_urls(self, page_url, soup):
        """Return the set of absolute /view/NNN.htm links found in *soup*."""
        new_urls = set()
        # Target anchors like:
        #   <a target="_blank" href="/view/32594.htm">...</a>
        # The original called find_all twice and kept only the second,
        # unfiltered result (all target="_blank" elements), discarding the
        # href filter and risking KeyError on anchors without an href.
        # A single filtered query keeps only the lemma links we want.
        links = soup.find_all('a', href=re.compile(r"/view/\d+\.htm"))
        for link in links:
            new_url = link['href']
            # Resolve the relative href against the page it came from.
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Return {'url': ..., 'title': ...} for the current page."""
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>迈克尔·乔丹</h1></dd>
        title_node = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find("h1")
        res_data['title'] = title_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        """Parse *html_cont*; return (new_urls, page_data), or None on bad input."""
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
url_manager.py:
#coding:utf8
class UrlManager(object):
    """Tracks which URLs are still pending and which have been crawled."""

    def __init__(self):
        # URLs waiting to be fetched.
        self.new_urls = set()
        # URLs that have already been handed out.
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue a single URL unless it is None or already known."""
        if url is None:
            return
        already_seen = url in self.new_urls or url in self.old_urls
        if not already_seen:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL in *urls*; ignores None and empty collections."""
        if not urls:
            return
        for candidate in urls:
            self.add_new_url(candidate)

    def has_new_url(self):
        """True while at least one URL is still pending."""
        return bool(self.new_urls)

    def get_new_url(self):
        """Pop an arbitrary pending URL and mark it as consumed."""
        chosen = self.new_urls.pop()
        self.old_urls.add(chosen)
        return chosen
spider_man.py:
#coding: utf8
import url_manager,html_downloader,html_outputer,html_parser
class SpiderMain(object):
    """Scheduler wiring together URL manager, downloader, parser and outputer."""

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        """Crawl up to 20 pages breadth-first starting from *root_url*,
        then dump everything collected to output.html."""
        count = 1  # ordinal of the URL currently being crawled
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                # Parenthesized print works under both Python 2 and 3.
                print('NO.%d: %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                # Harvest both the follow-up links and the page's data.
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                if count == 20:
                    break
                count = count + 1
            # 'except Exception, e' is Py2-only syntax; 'as' is valid
            # from Python 2.6 onward and required in Python 3.
            except Exception as e:
                # A single bad page must not abort the whole crawl.
                print(e)
        self.outputer.output_html()
if __name__ == "__main__":
    # Seed page: Michael Jordan's Baidu Baike entry.
    root_url = "http://baike.baidu.com/view/19096.htm"
    obj_spider = SpiderMain()
    # Dropped the stray trailing semicolon (non-idiomatic in Python).
    obj_spider.craw(root_url)
获得结果:
http://baike.baidu.com/view/19096.htm | 迈克尔·乔丹 |
http://baike.baidu.com/view/1091423.htm | 杰里·斯隆 |
http://baike.baidu.com/view/62675.htm | 卡尔·马龙 |
http://baike.baidu.com/view/571202.htm | 公国 |
http://baike.baidu.com/subview/4466937/15093391.htm | 赛季 |
http://baike.baidu.com/view/582.htm | NBA |
http://baike.baidu.com/view/1795775.htm | 篮圈 |
http://baike.baidu.com/view/1124818.htm | 迈克尔·里德 |
http://baike.baidu.com/view/36806.htm | 蒂姆·邓肯 |
http://baike.baidu.com/view/1140.htm | NBA季前赛 |