使用BeautifulSoup爬取笔趣阁小说

使用BeautifulSoup爬取笔趣阁小说

    • 代码
    • 实验一下

今天下午学习了一下 BeautifulSoup,正好本人书荒,于是以笔趣阁网站为研究对象,写了一段爬取小说的代码。放上来供大家参考,也请高手指正。
先放代码:

代码

import urllib.request as ur
from bs4 import BeautifulSoup
import ssl
import re


def get_soup(address):
    '''Download a web page and parse it into a BeautifulSoup object.

    Args:
        address: Absolute URL of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response body using the
        ``lxml`` parser.

    Errors raised by ``urlopen`` (e.g. ``urllib.error.URLError`` or a
    timeout after 20 seconds) propagate to the caller unchanged.
    '''
    # NOTE: certificate verification is deliberately switched off, so the
    # HTTPS peer is NOT authenticated.
    context = ssl._create_unverified_context()
    # A browser-like User-Agent is sent instead of urllib's default one
    # (presumably because the site rejects the default -- not verified here).
    headers = {'User-Agent': 'Chrome/68.0.3440.84'}
    request = ur.Request(address, headers=headers)
    # Use the response as a context manager so the underlying connection is
    # released deterministically instead of whenever the object is collected.
    with ur.urlopen(request, timeout=20, context=context) as response:
        content = response.read()
    return BeautifulSoup(content, 'lxml')


def get_chapter_list(book_address, web_address='https://www.biqudu.com/',
                     skip_links=12):
    '''Collect chapter titles and chapter URLs from a novel's index page.

    Args:
        book_address: Path of the book's index page relative to
            ``web_address`` (e.g. ``'31_31729/'``).
        web_address: Base URL of the site; it is concatenated directly with
            ``book_address``, so it should end with ``'/'``.
        skip_links: Number of leading ``<a>`` elements inside
            ``<div id="list">`` to ignore. Defaults to 12, the value that was
            previously hard-coded (presumably a block of links that precedes
            the real chapter list on this site -- confirm against the page).

    Returns:
        A ``(chapter_name_list, chapter_address_list)`` tuple of two lists of
        equal length, in page order.
    '''
    from urllib.parse import urljoin

    root_address = web_address + book_address  # URL of the index page
    soup = get_soup(root_address)
    chapter_links = soup.find('div', id='list').find_all('a')[skip_links:]

    chapter_name_list = [link.text for link in chapter_links]
    # Resolve each href against the index page rather than gluing strings
    # together: this avoids the '//' produced by a root-relative href and
    # also handles page-relative and fully qualified hrefs correctly.
    chapter_address_list = [
        urljoin(root_address, link['href']) for link in chapter_links
    ]
    return chapter_name_list, chapter_address_list


def get_text(chapter_address):
    '''Fetch one chapter page and return its body text, one paragraph per line.

    Paragraphs are recognised by the two ideographic spaces (U+3000 U+3000)
    that precede each of them in the text of ``<div id="content">``. Each
    returned paragraph is prefixed with two ASCII spaces, and paragraphs are
    joined with ``'\\n'``.

    Args:
        chapter_address: Absolute URL of the chapter page.

    Returns:
        The chapter text as a single string (empty if no paragraph marker is
        found).
    '''
    soup = get_soup(chapter_address)
    chapter_content = soup.find('div', id='content').text
    # Capture everything after the marker up to the next ideographic space or
    # line break. Matching only non-whitespace would cut a paragraph off at
    # its first ordinary space (e.g. around Latin words).
    pattern = re.compile(r'\u3000\u3000([^\u3000\r\n]+)')
    paragraphs = (found.strip() for found in pattern.findall(chapter_content))
    # Skip captures that were whitespace only.
    return '\n'.join('  ' + paragraph for paragraph in paragraphs if paragraph)


def crawl_fiction(book_address, object_file):
    '''Download a whole novel and save it to a UTF-8 text file.

    The chapter list is fetched first; then, for every chapter in order, a
    progress line is printed, the chapter title is written followed by a
    newline, and the chapter text is written followed by three newlines.

    Args:
        book_address: Path of the book's index page, passed straight to
            ``get_chapter_list``.
        object_file: Path of the output file; it is opened in ``'w'`` mode,
            so existing content is overwritten.
    '''
    names, addresses = get_chapter_list(book_address)
    total = len(names)
    progress = '共{}章,  正在爬取第{}章,......{:.2%}'
    with open(object_file, 'w', encoding='utf-8') as out:
        for index, (title, url) in enumerate(zip(names, addresses), start=1):
            print(progress.format(total, index, index / total))
            # The title is written before the chapter is fetched, so it is
            # already in the file if the download below fails.
            out.write(title + '\n')
            out.write(get_text(url) + '\n' * 3)
    print('爬取完毕')

实验一下

爬一爬天蚕土豆的《元尊》

# Crawl the novel "元尊" by 天蚕土豆; its index page on the site is
# https://www.biqudu.com/31_31729/
book_address = '31_31729/'
crawl_fiction(book_address=book_address, object_file='元尊.txt')

结果还行:

共597章,  正在爬取第1章,......0.17%
共597章,  正在爬取第2章,......0.34%
共597章,  正在爬取第3章,......0.50%
共597章,  正在爬取第4章,......0.67%
共597章,  正在爬取第5章,......0.84%
共597章,  正在爬取第6章,......1.01%
共597章,  正在爬取第7章,......1.17%
共597章,  正在爬取第8章,......1.34%
共597章,  正在爬取第9章,......1.51%

你可能感兴趣的:(随笔)