python 实现爬取指定小说(两种实现方案 .附源码)

python 实现爬取指定小说实时下载(附源码)

import requests
import re
import time
import random


def download(book_name):
    """Search biquge5200.cc for *book_name* and return (novel_url, novel_name).

    Parameters:
        book_name - exact title of the novel to look up (str)
    Returns:
        (novel_url, novel_name) for the first exact-title match,
        or None when nothing matches.
    Network/parse errors are printed and treated as "no result".
    """
    search_real_url = ('https://www.biquge5200.cc/modules/article/search.php'
                       '?searchkey=' + book_name)
    novel_list = []  # default so a failed request doesn't leave it unbound
    try:
        novel_source = requests.get(search_real_url).text
        # NOTE(review): the original regex was corrupted when this snippet was
        # published (its HTML tags were stripped, leaving only two capture
        # groups for a three-value unpack).  Reconstructed to capture
        # (url, title, author) from the search-result rows — verify against
        # the live page markup.
        reg1 = r'<a cite="[^"]*" href="(.*?)">(.*?)</a>.*?<a[^>]*>(.*?)</a>'
        # all search hits: (novel url, novel title, author name)
        novel_list = re.findall(reg1, novel_source, re.S)
        if not novel_list:
            print('你要找的小说不存在,请检查后重新输入')
    except Exception as e:
        print(e)
    for novel_url, novel_name, novel_author in novel_list:
        if novel_name == book_name:
            print('你即将下载的小说:%s 作者:%s' % (novel_name, novel_author))
            return novel_url, novel_name


def get_chapter(url):
    """Fetch the chapter-index page and return [(chapter_url, chapter_name)].

    Parameters:
        url - the novel's index-page URL (str)
    Returns:
        list of (url, title) tuples; empty on request/parse failure.
    """
    chapter_list = []  # default so a failed request doesn't leave it unbound
    try:
        chapter_page_source = requests.get(url).text
        # NOTE(review): regex reconstructed — the published snippet lost its
        # HTML tags; verify against the chapter-index markup of the site.
        reg2 = r'<dd><a href="(.*?)">(.*?)</a></dd>'
        chapter_list = re.findall(reg2, chapter_page_source)
    except Exception as e:
        print(e)
    return chapter_list


def get_content(chapter_list, novel_name):
    """Download every chapter and append it to '<novel_name>.txt'.

    Parameters:
        chapter_list - [(chapter_url, chapter_name)] from get_chapter()
        novel_name   - base name of the output text file (str)
    Per-chapter failures are printed and skipped so the crawl continues.
    """
    count = 0
    length = len(chapter_list)
    for chapter_url, chapter_name in chapter_list:
        try:
            # polite crawl delay: 1–2 s between chapter requests
            time.sleep(1 + random.random())
            content_source = requests.get(chapter_url).text
            # NOTE(review): the following patterns were reconstructed — the
            # published snippet had its HTML stripped; verify the id/markup
            # against the live chapter pages.
            content = re.findall(r'<div id="content">(.*?)</div>',
                                 content_source, re.S)[0]
            # drop the site's self-promotion line embedded in the chapter body
            stsdw = re.sub(r'<p>(.*?)\(《》\)</p>', ' ', str(content))
            # strip remaining markup and normalise paragraph breaks
            contents = (stsdw.replace('<br/>', '\n')
                             .replace('&nbsp;&nbsp;&nbsp;&nbsp;', '\n\t\t')
                             .replace('<p>', '')
                             .replace('</p>', ''))
            count += 1
            with open(novel_name + '.txt', 'a', encoding='utf-8') as f:
                f.write(chapter_name + '\n' + contents + '\n' * 2)
            print('正在写入: ' + chapter_name)
            # report a true percentage (the original printed the raw fraction)
            print('进度:%0.2f' % (count / length * 100) + '%')
        except Exception as e:
            print(e)


if __name__ == '__main__':
    book_name = input('请输入小说名:')  # e.g. 圣墟 / 凡人修仙传
    novel_url, novel_name = download(book_name)
    chapter_list = get_chapter(novel_url)
    get_content(chapter_list, novel_name)
# 以上内容仅供学习使用
实现效果图
爬取 笔趣阁 的《凡人修仙传》,请勿过度爬取他人网站,后果自负。

python 实现爬取指定小说(两种实现方案 .附源码)_第1张图片

实现效果二:
利用爬虫xpath 进行爬取笔趣阁 的《凡人修仙传仙界篇》,该代码更容易理解:
说明:该网站搜索功能崩溃,无法使用 输入搜索功能
可自行打开网站进行查找,选取目标小说后修改类中的 URL
如下:
  		self.server = 'http://www.biquge.info/书号id/'
        self.target = 'http://www.biquge.info/书号id/'
当然你不想爬取这个网站也行,可自行修改爬取网站,xpath 的总体实现思路已提供
当然绝大部分网站是有反爬策略的,注意合理配置,访问即可
import io
import re
import sys
import time

import requests
from lxml import etree

"""
类说明:下载《笔趣阁》网小说《凡人修仙传仙界篇》
Parameters:
    无
Returns:
    无
Modify:
    2019-12-24
"""


class downloader(object):
    """Download the novel 《凡人修仙传仙界篇》 from www.biquge.info.

    Workflow: get_download_url() fills the chapter index (names/urls/nums),
    then each chapter is fetched with get_contents() and appended to a text
    file via writer().
    """

    def __init__(self):
        # Index page of the book; ``server`` is the prefix joined with each
        # chapter's relative href.  Replace the book id to fetch another book.
        self.server = 'http://www.biquge.info/22_22533/'
        self.target = 'http://www.biquge.info/22_22533/'
        self.names = []  # chapter titles
        self.urls = []   # absolute chapter URLs
        self.nums = 0    # number of chapters
        self.books = 0   # book-title <h1> element once fetched (callers read .text)

    def get_download_url(self):
        """Fetch the index page and populate names, urls, nums and books.

        Modify:
            2019-12-24
        """
        req = requests.get(url=self.target)
        html = req.content.decode()
        element = etree.HTML(html)
        hrefs = element.xpath('//*[@id="list"]/dl/dd/a/@href')
        titles = element.xpath('//*[@id="list"]/dl/dd/a/text()')
        # <h1> element holding the book title
        self.books = element.xpath('//*[@id="info"]/h1')[0]
        for href in hrefs:
            self.urls.append(self.server + href)
        self.names.extend(titles)
        self.nums = len(self.urls)

    def get_contents(self, target):
        """Fetch one chapter page and return its text.

        Parameters:
            target - chapter URL (string)
        Returns:
            chapter text, each paragraph prefixed with '\\n\\t' (string)
        Modify:
            2019-12-24
        """
        req = requests.get(url=target)
        contens = req.content.decode('utf-8')  # force utf-8 decoding
        tree = etree.HTML(contens)
        paragraphs = tree.xpath('//*[@id="content"]/text()')
        buf = io.StringIO()
        for line in paragraphs:
            # prepend a newline + tab so the saved text reads naturally
            buf.write('\n\t' + line)
        # The original called the nonexistent re.time(1) (with a stray tab
        # that is a TabError in Python 3); the intent was a one-second crawl
        # delay between chapter requests.
        time.sleep(1)
        return buf.getvalue()

    def writer(self, name, path, text):
        """Append one chapter (title + body + blank line) to *path*.

        Parameters:
            name - chapter title (string)
            path - output file name in the current directory (string)
            text - chapter body (string)
        Modify:
            2017-09-13
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(text)
            f.write('\n\n')


if __name__ == "__main__":
    dl = downloader()
    dl.get_download_url()
    print('《' + dl.books.text + '》开始下载:')
    for i in range(dl.nums):
        dl.writer(dl.names[i], dl.books.text + '.txt', dl.get_contents(dl.urls[i]))
        # Show a true percentage (the original printed the raw fraction as a
        # percent) and count the chapter just written (i + 1).
        sys.stdout.write("  已下载:%.3f%%" % (100 * (i + 1) / dl.nums) + '\r')
        sys.stdout.flush()
    print(dl.books.text + '下载完成')

python 实现爬取指定小说(两种实现方案 .附源码)_第2张图片
最后:本人实现了许多代理池的免费 ip 爬取,网站近 20 余个,需要 xpath 实现脚本的可 @ 该账号

你可能感兴趣的:(python 实现爬取指定小说(两种实现方案 .附源码))