Crawler output:
crow 1:https://baike.baidu.com/view/21087.html
crow 2:http://v.baidu.com/
crow failed
crow 2:http://map.baidu.com/
crow failed
crow 2:https://baike.baidu.com/ziran
crow failed
crow 2:https://baike.baidu.com/kedou/
crow failed
crow 2:https://baike.baidu.com/uc/favolemma
crow failed
crow 2:https://baike.baidu.com/redirect/bd57D_QYRJrWtaYXMHfo4iHnUkk-10lbsmyUNTIBCyStIVCwG90uXOqxPuW_YBCkKIYcdmiuJg
crow failed
crow 2:https://baike.baidu.com/item/%E9%A9%AC%E7%89%B9%E5%88%A9
crow 3:https://baike.baidu.com/item/%E5%BA%94%E7%94%A8%E7%A8%8B%E5%BA%8F%E7%BC%96%E7%A8%8B%E6%8E%A5%E5%8F%A3
crow 4:https://baike.baidu.com/feiyi?fr=dhlfeiyi
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# URL manager: keeps track of URLs still to crawl and URLs already crawled
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    # add a single URL, skipping duplicates and URLs we have already visited
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # add a batch of URLs (e.g. the links extracted from one page)
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # True while there is at least one URL left to crawl
    def has_new_url(self):
        return len(self.new_urls) != 0

    # hand out one pending URL and remember it as visited
    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
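As a quick sanity check, the manager can be exercised on its own. This is just a sketch, assuming the class above is saved as url_manage.py (the module name used in the scheduler's imports):

from url_manage import UrlManager

manager = UrlManager()
manager.add_new_url('https://baike.baidu.com/view/21087.html')
manager.add_new_url('https://baike.baidu.com/view/21087.html')  # duplicate, silently ignored
print manager.has_new_url()    # True
url = manager.get_new_url()    # moves the URL from new_urls to old_urls
manager.add_new_url(url)       # already in old_urls, so it is not re-queued
print manager.has_new_url()    # False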
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# HTML downloader
import urllib2

class HtmlDownloader(object):
    # download the page behind the given URL and return it as an HTML string
    def download(self, url):
        # nothing to download
        if url is None:
            return None
        response = urllib2.urlopen(url)
        # anything other than HTTP 200 counts as a failed download
        if response.getcode() != 200:
            return None
        # return the raw HTML string
        return response.read()
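A minimal sketch of calling the downloader directly on the entry URL. It needs network access, and urllib2.urlopen raises on unreachable hosts, which is what the try/except in the scheduler later catches:

from html_downloader import HtmlDownloader

downloader = HtmlDownloader()
html = downloader.download('https://baike.baidu.com/view/21087.html')
if html is None:
    print 'download failed'
else:
    print 'downloaded %d bytes' % len(html)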
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# HTML parser
from bs4 import BeautifulSoup
import re
import urlparse

class Html_Parser(object):
    # extract new URLs from the page
    def _get_new_urls(self, page_url, soup):
        # set for the extracted URLs
        new_urls = set()
        # collect every <a href=...> on the page; the empty pattern matches all links,
        # while a stricter pattern such as r'/view/\d+\.htm' (or r'/item/' on the
        # current Baike URL scheme) would keep the crawl on encyclopedia pages
        links = soup.find_all('a', href=re.compile(r''))
        for link in links:
            # the extracted href is usually relative, so it has to be joined;
            # urljoin resolves it against page_url into a full URL
            new_url = link['href']
            new_full_url = urlparse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # parse the data we care about: the lemma title and its summary
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # the title sits in <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # the summary sits in <div class="lemma-summary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    # parse html_cont into two results: the new URL set and the page data
    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
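The parser can be tried without the downloader by feeding it a hand-written fragment that mimics the two class names it looks for. The HTML below is made up purely for illustration, and the sketch assumes the class above is saved as html_parser.py:

# -*- coding: utf-8 -*-
from html_parser import Html_Parser

page_url = 'https://baike.baidu.com/view/21087.html'
html_cont = '''
<html><body>
  <dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
  <div class="lemma-summary">A one-line stand-in for the lemma summary.</div>
  <a href="/item/%E9%A9%AC%E7%89%B9%E5%88%A9">a relative link</a>
</body></html>
'''

parser = Html_Parser()
new_urls, new_data = parser.parse(page_url, html_cont)
print new_urls           # set([u'https://baike.baidu.com/item/%E9%A9%AC%E7%89%B9%E5%88%A9'])
print new_data['title']  # Python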
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import url_manage
import html_downloader
import html_parser
import html_outputer

class SpiderMain(object):
    # wire up the four components
    def __init__(self):
        self.urls = url_manage.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.Html_Parser()
        self.outputer = html_outputer.Html_Outputer()

    # the crawl scheduler
    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        # note the call: has_new_url is a method, not an attribute
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                # the 'crow N:url' lines at the top of this post come from here
                print 'crow %d:%s' % (count, new_url)
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                # stop after 100 pages
                if count == 100:
                    break
                count = count + 1
            except Exception:
                print 'crow failed'
        self.outputer.output_html()

if __name__ == '__main__':
    # entry URL
    root_url = 'https://baike.baidu.com/view/21087.html'
    obj_spider = SpiderMain()
    # start the crawler
    obj_spider.craw(root_url)
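The scheduler can also be driven from another script or an interactive session. This sketch assumes the file above is saved as spider_main.py (the post does not name the file) and reuses an item-style URL already seen in the crawl log:

from spider_main import SpiderMain

spider = SpiderMain()
# crawl starting from a different lemma; the loop still stops at 100 pages
spider.craw('https://baike.baidu.com/item/%E9%A9%AC%E7%89%B9%E5%88%A9')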
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# writes the crawled data out as an HTML page
class Html_Outputer(object):
    # holds the collected records
    def __init__(self):
        self.datas = []

    # collect one parsed record (url / title / summary)
    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    # write the collected records into an HTML table
    def output_html(self):
        # open the output file in write mode
        fout = open('outhtml.html', 'w')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        # one table row per crawled page
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'].encode('utf-8'))
            fout.write("<td>%s</td>" % data['summary'].encode('utf-8'))
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
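A minimal sketch of the outputer on its own, with a single hand-made record (assuming the class above is saved as html_outputer.py):

# -*- coding: utf-8 -*-
from html_outputer import Html_Outputer

outputer = Html_Outputer()
outputer.collect_data({
    'url': 'https://baike.baidu.com/view/21087.html',
    'title': u'Python',
    'summary': u'A stand-in summary for the lemma.',
})
outputer.output_html()   # writes a one-row table to outhtml.html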