Getting Started with Python Network Applications: Building a Web Crawler

** Please credit the original when reposting ** Source: blog.csdn.net/clark_xu (徐长亮的专栏)

 

Overview:

(1) Using urllib.request.Request

(2) Using BeautifulSoup

(3) Web crawler design overview

(4) Web crawler code

1. Web crawler design overview

[Figure 1: web crawler design overview showing the scheduler, URL manager, downloader, parser, and outputer modules]

 

2. Basic usage of urllib.request.Request

Create the request object:

request=urllib.request.Request(url)

Add a request header:

request.add_header('user-agent','Mozilla/5.0')

Get the response:

response1=urllib.request.urlopen(request)
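
Putting the three snippets together, a minimal end-to-end request looks roughly like this (baidu.com is only a stand-in URL):

import urllib.request

request = urllib.request.Request("http://www.baidu.com")
request.add_header('user-agent', 'Mozilla/5.0')
response1 = urllib.request.urlopen(request)
html = response1.read().decode('utf-8')  # decode assuming UTF-8; check the page's charset in practice
print(response1.getcode(), len(html))    # HTTP status code and size of the decoded body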

3. Three ways to access a website

# coding=gbk
'''
Created on 2016-04-14

@author: clark
'''

import urllib.request
import http.cookiejar

url = "http://www.baidu.com"

# Method 1: plain urlopen on the URL string
print('the first method')
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))

# Method 2: build a Request object and add a User-Agent header
print('the second method')
request = urllib.request.Request(url)
request.add_header('user-agent', 'Mozilla/5.0')
response1 = urllib.request.urlopen(request)
print(response1.getcode())
print(len(response1.read()))

# Method 3: install an opener that keeps cookies in a CookieJar
print('the third method')
ci = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(ci))
urllib.request.install_opener(opener)
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))
print(ci)
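
Once the cookie-aware opener is installed, the CookieJar can also be iterated to see the individual cookies the site set (names and values depend on the server):

for cookie in ci:
    print(cookie.name, '=', cookie.value)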

4. Using beautifulsoup4

# coding=utf-8
'''
Created on 2016/4/15

@author: clark
'''
import urllib.request
from bs4 import BeautifulSoup
import http.cookiejar

# url = "http://blog.csdn.net/clark_xu"
url = "http://www.baidu.com"
request = urllib.request.Request(url)
request.add_header('user-agent', 'Mozilla/5.0')
response1 = urllib.request.urlopen(request)

print('the first method')
print(response1.getcode())
# read() consumes the response body, so read it once and reuse the result
html_doc = response1.read().decode('UTF-8')
print(len(html_doc))
print(type(html_doc))
print(html_doc)

# html_doc is already a decoded str, so no from_encoding argument is needed
soup = BeautifulSoup(html_doc, "html.parser")
print('the second method')
print(soup.prettify())

print('the third method')
ci = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(ci))
urllib.request.install_opener(opener)
response1 = urllib.request.urlopen(url)
print(response1.getcode())
print(len(response1.read()))
print(ci)
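
prettify() only re-serializes the page; the usual next step is to query the parse tree. A short sketch that lists every <a> tag in the soup built above (which links appear depends on the site):

for link in soup.find_all('a'):
    print(link.get('href'), link.get_text())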

5. Web crawler code

1. Page downloader module: baike_spider.html_downloader

#coding:utf8
'''
Created on 2016-04-15

@author: clark
'''
import urllib.request

class HtmlDownloader(object):

    def download(self, url):
        # Download a page and return its raw content (bytes), or None on failure
        if url is None:
            return None

        request = urllib.request.Request(url)
        request.add_header('user-agent', 'Mozilla/5.0')
        response = urllib.request.urlopen(request)

        if response.getcode() != 200:
            return None
        return response.read()
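
A quick way to exercise this module on its own, assuming the files are laid out in a baike_spider package as the import paths suggest (baidu.com is a stand-in URL):

downloader = HtmlDownloader()
content = downloader.download("http://www.baidu.com")
print(len(content) if content is not None else "download failed")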
    


2. Result output module (writes the collected data to HTML): baike_spider.html_outputer

#coding:utf8
'''
Created on 2016-04-15

@author: clark
'''
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # open in text mode with an explicit encoding so the str values
        # can be written directly, without encoding them to bytes first
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
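
A quick standalone check of this module; the dict below is sample data shaped like what the parser produces (url, title, summary keys):

outputer = HtmlOutputer()
outputer.collect_data({'url': 'http://example.com', 'title': 'Example', 'summary': 'A sample entry.'})
outputer.output_html()  # writes output.html next to the script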


3. Page parser module: baike_spider.html_parser

#coding:utf8
'''
Created on 2016-04-15

@author: clark
'''
from bs4 import BeautifulSoup
import re
from baike_spider import urlparse

class HtmlParser(object):

    def _get_new_urls(self, page_url, soup):
        # collect every link whose href contains /wiki and turn it into a full URL
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/wiki"))
        for link in links:
            new_url = link['href']
            # new_full_url = urlparse.urljoin(page_url, new_url)
            new_full_url = 'http://www.liaoxuefeng.com' + new_url
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # the sidebar <ul class="uk-nav uk-nav-side"> holds the page title on this site
        title_node = soup.find('ul', class_="uk-nav uk-nav-side")
        res_data['title'] = title_node.get_text()
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, "html.parser", from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data


4. URL manager module: baike_spider.url_manger
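
The listing below is a minimal sketch of what baike_spider.url_manger can look like, inferred from the calls SpiderMain makes on it later (add_new_url, add_new_urls, has_new_url, get_new_url); it keeps two sets so each URL is crawled at most once.

#coding:utf8

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url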



5. URL joining module: baike_spider.urlparse

#coding:utf8

class urlparse(object):

    def urljoin(self, page_url, new_url):
        # naive join: simply concatenate the page URL and the relative path
        new_full_url = page_url + new_url
        return new_full_url
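
For comparison, the standard library's urllib.parse.urljoin handles absolute and relative hrefs correctly, which plain string concatenation does not; a small sketch (the second argument is an illustrative href):

from urllib.parse import urljoin

print(urljoin('http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000',
              '/wiki/0014316089557264'))
# -> http://www.liaoxuefeng.com/wiki/0014316089557264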


6. Crawler scheduler and program entry point: baike_spider.spider_main

#coding:utf8

from baike_spider import url_manger, html_downloader, html_parser, html_outputer

class SpiderMain(object):
    def __init__(self):
        self.urls = url_manger.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_url):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d:%s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)

                if count == 1000:
                    break
                count = count + 1
            except Exception as e:
                print('craw failed:', e)
        self.outputer.output_html()

if __name__ == "__main__":
    # root_url = "http://baike.baidu.com/view/21087.htm"
    root_url = 'http://www.liaoxuefeng.com/wiki/0014316089557264a6b348958f449949df42a6d3a2e542c000'
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)


 

 

 

 

 
