Python爬虫基本框架

想想Python的import导包,很方便。爬虫也可以自己建立模块:把网址交给它,模块解析后返回网页信息,在爬取大型网站时很方便。按照我们写爬虫程序的内容,可以分为URL管理器、HTML下载器、HTML解析器、数据存储器、爬虫调度器五个模块。

URL管理器

class UrlManager():
    """Keep track of URLs waiting to be crawled and URLs already handed out."""

    def __init__(self):
        # Two sets: pending URLs, and URLs already returned by get_new_url().
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, url):
        """Queue ``url`` unless it is None or already pending/crawled."""
        if url is None:
            return
        already_known = url in self.new_urls or url in self.old_urls
        if not already_known:
            self.new_urls.add(url)

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return self.new_url_size() > 0

    def get_new_url(self):
        """Take one pending URL, record it as crawled, and return it.

        ``set.pop`` is used, so which URL comes out is arbitrary and an
        empty pending set raises ``KeyError``.
        """
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked

    def add_new_urls(self, urls):
        """Queue every URL in ``urls``; None or an empty collection is a no-op."""
        if urls is not None and len(urls) != 0:
            for candidate in urls:
                self.add_new_url(candidate)

    def new_url_size(self):
        """Return how many URLs are still pending."""
        return len(self.new_urls)

    def old_url_size(self):
        """Return how many URLs have already been handed out."""
        return len(self.old_urls)

HTML下载器

import requests

class HtmlDownloader():
    """Fetch a page over HTTP and hand back its body as text."""

    def download(self, url, timeout=10):
        """Download ``url`` and return the response body.

        Args:
            url: Address to fetch. ``None`` is accepted and yields ``None``.
            timeout: Seconds passed to ``requests.get`` as its timeout.

        Returns:
            The response text, decoded as UTF-8, when the status code is
            200; ``None`` for a ``None`` url or any other status code.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (connection errors, timeouts, ...).
        """
        if url is None:
            return None

        headers = {
            'User-Agent':'Mozilla/5.0(WindowsNT10.0;Win64;x64)AppleWebKit/537.36(KHTML,likeGecko) Chrome/70.0.3538.77Safari/537.36'
        }

        # requests applies no timeout by default, so a stalled server would
        # block the whole crawl forever; always pass one explicitly.
        res = requests.get(url, headers=headers, timeout=timeout)
        if res.status_code != 200:
            return None

        # Force UTF-8 before reading .text so the body is decoded with it.
        res.encoding = 'utf-8'
        return res.text

HTML解析器

import re
from urllib import request
from bs4 import BeautifulSoup

class HtmlParser(object):
    """Pull follow-up links and the title/summary record out of one page."""

    # Anchors whose href matches this pattern are treated as links to follow.
    # Compiled once here instead of on every _get_new_urls() call.
    _ITEM_HREF = re.compile(r'/item/.*')
    # Relative hrefs are resolved against this site root.
    _BASE_URL = 'https://baike.baidu.com'

    def parser(self, page_url, html_cont):
        """Parse one downloaded page.

        Args:
            page_url: URL the page was fetched from; stored in the record.
            html_cont: Page markup, either decoded text or raw bytes.

        Returns:
            ``(new_urls, new_data)`` where ``new_urls`` is a set of absolute
            URLs and ``new_data`` is a dict with ``url``, ``title`` and
            ``summary`` keys; ``None`` when either argument is ``None``.
        """
        if page_url is None or html_cont is None:
            return None

        # from_encoding only applies to byte input; passing it together with
        # already-decoded text makes Beautiful Soup ignore it with a warning.
        if isinstance(html_cont, bytes):
            soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        else:
            soup = BeautifulSoup(html_cont, 'html.parser')

        new_urls = self._get_new_urls(soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, soup):
        """Return the set of absolute URLs for every matching <a> in ``soup``."""
        # urljoin's documented home is urllib.parse; reaching it through
        # urllib.request only works because that module happens to import it.
        from urllib.parse import urljoin

        links = soup.find_all('a', href=self._ITEM_HREF)
        return {urljoin(self._BASE_URL, link['href']) for link in links}

    @staticmethod
    def _descend(node, *steps):
        """Apply successive find() calls, returning None as soon as one misses.

        Each step is ``(tag_name, css_class)``; a ``None`` css_class means
        the lookup is done by tag name only.
        """
        for tag_name, css_class in steps:
            if node is None:
                return None
            if css_class is None:
                node = node.find(tag_name)
            else:
                node = node.find(tag_name, class_=css_class)
        return node

    def _get_new_data(self, page_url, soup):
        """Build the record for this page.

        ``title`` is the ``.string`` of the <h1> inside
        ``dd.lemmaWgt-lemmaTitle-title``; ``summary`` is the ``.string`` of
        the first <a> inside ``div.para`` inside ``div.lemma-summary``.
        Either value is ``None`` when the corresponding node is absent,
        instead of raising ``AttributeError`` and losing the whole page.
        """
        title = self._descend(
            soup, ('dd', 'lemmaWgt-lemmaTitle-title'), ('h1', None))
        summary = self._descend(
            soup, ('div', 'lemma-summary'), ('div', 'para'), ('a', None))

        data = {
            'url': page_url,
            'title': title.string if title is not None else None,
            'summary': summary.string if summary is not None else None,
        }
        # Progress output kept from the original implementation.
        print(data['title'])
        print(data['summary'])
        return data

数据存储器

import codecs

class DataOutput():
    """Collect parsed records in memory and dump them as an HTML table."""

    def __init__(self):
        # Records waiting to be written; each is a dict with the keys
        # 'url', 'title' and 'summary' (those are the keys output_html reads).
        self.datas = []

    def store_data(self, data):
        """Remember one record; ``None`` is ignored."""
        if data is None:
            return
        self.datas.append(data)

    def output_html(self, path='bike.html'):
        """Write every stored record as one table row, then empty the store.

        Args:
            path: Destination file; defaults to the original hard-coded
                ``bike.html``. The file is written as UTF-8.

        Raises:
            KeyError: if a stored record lacks 'url', 'title' or 'summary'.
            OSError: if the file cannot be opened or written.
        """
        from html import escape

        # Build all rows first so a malformed record fails before the
        # output file is touched. Values are escaped because they come from
        # scraped pages; str() keeps None rendering as 'None' like '%s' did.
        rows = []
        for data in self.datas:
            cells = ''.join(
                '<td>%s</td>' % escape(str(data[key]))
                for key in ('url', 'title', 'summary')
            )
            rows.append('<tr>%s</tr>' % cells)

        # NOTE(review): the tag strings were missing from the text this code
        # was recovered from (every write was an empty string); the markup
        # below is a reconstruction -- confirm it matches the intended layout.
        with open(path, 'w', encoding='utf-8') as fout:
            fout.write("<html>")
            fout.write("<head><meta charset='utf-8'/></head>")
            fout.write("<body>")
            fout.write("<table>")
            fout.writelines(rows)
            fout.write("</table>")
            fout.write("</body>")
            fout.write("</html>")

        # Clear only after a successful write. Removing items inside the
        # loop, as before, skipped every other record.
        del self.datas[:]

爬虫调度器

from spider_base.DataOutput import DataOutput
from spider_base.HtmlDownloader import HtmlDownloader
from spider_base.HtmlParser import HtmlParser
from spider_base.UrlManager import UrlManager

class SpiderMan():
    """Scheduler that drives the URL manager, downloader, parser and output."""

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser= HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url, max_pages=100):
        """Crawl outward from ``root_url`` and write the collected records.

        Args:
            root_url: First URL to fetch.
            max_pages: Stop once this many URLs have been taken from the
                manager (successful or not). Defaults to the original
                hard-coded limit of 100; without a limit the crawl would
                keep following links.

        A failure on one page is reported and the crawl moves on to the next
        pending URL; ``output_html`` is called once the loop ends.
        """
        self.manager.add_new_url(root_url)

        while self.manager.has_new_url() and self.manager.old_url_size() < max_pages:
            # Taken outside the try so the URL is available for the report.
            new_url = self.manager.get_new_url()
            try:
                html = self.downloader.download(new_url)
                if html is None:
                    # Handle "nothing downloaded" explicitly rather than
                    # letting it surface later as a tuple-unpacking error.
                    print('crawl failed: %s (no content downloaded)' % new_url)
                    continue

                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)

                print('已经抓取的链接数%s'%self.manager.old_url_size())
            except Exception as exc:
                # Best-effort crawl: one bad page must not stop the run, but
                # say which URL failed and why instead of a bare message.
                print('crawl failed: %s (%r)' % (new_url, exc))

        self.output.output_html()

if __name__ == '__main__':
    # Script entry point: start the crawl from a single seed entry page.
    seed_url = 'https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB'
    SpiderMan().crawl(seed_url)

你可能感兴趣的:(爬虫基础)