Crawling an Entire Novel from 全书网

Tools (install command below the list):

  1. requests module
  2. beautifulsoup4 module
  3. logging module
  4. lxml module
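
Of these, logging ships with the Python standard library; the other three can be installed with pip:

pip install requests beautifulsoup4 lxml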

Main workflow (a minimal sketch follows the list):

  1. Request the url and get the page back
  2. Parse the page and extract the data
  3. Save the data
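
A minimal sketch of these three steps (the URL and the XPath here are placeholders, not the actual 全书网 markup, which the full source below handles):

import requests
from lxml import etree

# Placeholder URL and XPath, just to show the request -> parse -> save flow
response = requests.get('http://example.com/novel/')    # 1. request the url, get the page
page = etree.HTML(response.text)                        # 2. parse the page
paragraphs = page.xpath("//div[@id='content']/text()")  #    and extract the data
with open('novel.txt', 'w', encoding='utf-8') as f:     # 3. save the data
    f.write(''.join(p.strip() for p in paragraphs))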

Note: I wrote the parsing in two different ways, one with BeautifulSoup and the other with XPath; an example of both follows. The request headers in the source code have been redacted.
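
For example, both of the following pull the same link text out of a toy snippet (not the real 全书网 page):

from bs4 import BeautifulSoup
from lxml import etree

# Toy snippet, not the actual site markup
html = "<div class='fenlei'><span><a href='/book/1'>书名</a></span></div>"

# BeautifulSoup style
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('div', attrs={'class': 'fenlei'})[0].a.string)  # 书名

# XPath style
page = etree.HTML(html)
print(page.xpath("//div[@class='fenlei']/span/a[1]/text()")[0])     # 书名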

The source code is as follows:

import requests
from bs4 import BeautifulSoup
import logging
from lxml import etree

class Spider_Novel():
    def __init__(self, url):
        self.novel_url = url
        # Reuse one session so the headers below apply to every request
        self.session = requests.session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/53.36 (KHTML, like Gecko) Chrome/69.0.34.81 Safari/57.36'}
    def __call__(self, *args, **kwargs):
        """
        Entry point: set up logging, fetch the index page, parse out the
        title and chapter list, then save the novel.
        :param args:
        :param kwargs:
        :return:
        """
        self.logger = logging.getLogger('novel')
        self.logger.setLevel(logging.INFO)
        sh = logging.StreamHandler()
        sh.setLevel(logging.INFO)
        # Named 'formatter' rather than 'format' to avoid shadowing the built-in
        formatter = logging.Formatter(
            'time: %(asctime)s, '
            'level: %(levelname)s, '
            'message: %(message)s'
        )
        sh.setFormatter(formatter)
        self.logger.addHandler(sh)
        response = self.Download_html(self.novel_url)
        # title, chapter_urls = self.Parse_title_chapter(response)  # BeautifulSoup version
        title, chapter_urls = self.Parse_title_chapter_xpath(response)
        self.Retain(title, chapter_urls)
    def Download_html(self, novel_url):
        """
        Fetch the given url and return the page text.
        :param novel_url:
        :return:
        """
        try:
            response = self.session.get(novel_url)
            # The site serves GB-encoded pages; gbk is a superset of gb2312
            response.encoding = 'gbk'
            return response.text
        except Exception as e:
            self.logger.error("Failed to fetch the page")
            raise e
    def Parse_title_chapter(self, response):
        """
        Page parser: extract the novel title and a list of
        (chapter name, chapter url) pairs from the index page.
        :param response:
        :return:
        """
        try:
            soup = BeautifulSoup(response, 'lxml')
            title = soup.find_all('div', attrs={'class': 'fenlei'})[0].a.string
            div = soup.find_all('div', class_='booklist first')[0]
            # Generator of (chapter name, chapter url) pairs
            chapter_urls = ((i.a['title'], i.a['href']) for i in div.find_all('li'))
            return title, chapter_urls
        except Exception as e:
            self.logger.error("Failed to parse the novel index page")
            raise e
    def Retain(self, title, chapter_urls):
        """
        Given the (chapter name, chapter url) pairs, fetch each url,
        parse out the chapter text, and append it to a file.
        :param title:
        :param chapter_urls:
        :return:
        """
        with open('%s.txt' % title, 'w', encoding='utf-8') as f:
            for name, url in chapter_urls:
                html = self.Download_html(url)
                try:
                    content = self.Parse_content_xpath(html)
                    f.write(name)
                    f.write('\n')
                    f.write(content)
                    f.write('\n')
                    self.logger.info("%s downloaded; chapter url: %s" % (name, url))
                except Exception as e:
                    self.logger.error("Failed to parse %s" % name)

        self.logger.info("Entire novel %s downloaded" % title)
    def Parse_content(self, html):
        """
        Parse the chapter page and extract the chapter text.
        :return:
        """
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find_all('div', id='chapter_content')[0].stripped_strings
        # Strip the inline-script calls that sit inside the content div
        content = ''.join(content).replace('screen_content_set();', '').replace('\r\n', '').replace('readsidebar(\'1\');', '')
        return content


    def Parse_title_chapter_xpath(self, response):
        """
        XPath version of the page parser: extract the novel title and a list
        of (chapter name, chapter url) pairs from the index page.
        :param response:
        :return:
        """
        try:
            page = etree.HTML(response)
            title = page.xpath("//div[@class='fenlei']/span/a[1]/text()")[0]
            a = page.xpath("//div[@class='booklist first']/ul/li/a")
            chapter_urls = [(i.xpath("./text()")[0], i.xpath("./@href")[0]) for i in a]
            return title, chapter_urls
        except Exception as e:
            self.logger.error("Failed to parse the novel index page")
            raise e
    def Parse_content_xpath(self, html):
        """
        XPath version: parse the chapter page and extract the chapter text.
        :return:
        """
        page = etree.HTML(html)
        # The first two text nodes inside the div are inline-script residue, so skip them
        content = page.xpath("//div[@id='chapter_content']/text()")[2:]
        content = ''.join(x.strip() for x in content)
        return content
if __name__ == '__main__':
    spider = Spider_Novel('http://www.574aw.com/zhongshengzhiminyishiweitian/')
    spider()
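
A note on the [2:] slice in Parse_content_xpath: /text() returns one string per text node, and the chapter <div> is broken into many nodes by <br> tags. The slice reflects the code's assumption that the first two nodes are the same inline-script residue that Parse_content strips with replace(). A toy snippet mimicking that assumed layout:

from lxml import etree

# Mimics the assumed chapter layout: two script-residue text nodes,
# then the real paragraphs separated by <br> tags
html = ("<div id='chapter_content'>screen_content_set();<br>"
        "readsidebar('1');<br>第一段。<br>第二段。<br>第三段。</div>")
page = etree.HTML(html)
nodes = page.xpath("//div[@id='chapter_content']/text()")
print(''.join(x.strip() for x in nodes[2:]))  # 第一段。第二段。第三段。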

