1024!爬取小说

这是第一个能成功抓取小说的版本。下个版本计划添加一个字典,保存书名与对应的 url,方便用户通过搜索书名直接下载。目前还没想好如何处理书名输入错误的情况。源码如下:

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#文件  :Module_10_24_novel.py
# author named sunxth
#IDE   pycharm
import os
from urllib import request
from bs4 import BeautifulSoup
import jsonpath
def _fetch_html(url, headers):
    """Download *url* with the given request headers and return the decoded page.

    The body is decoded as GBK; bytes that are not valid GBK are dropped
    ("ignore") instead of raising.
    """
    # Pass the Request object (not the bare URL) so the headers are really sent.
    req = request.Request(url=url, headers=headers)
    with request.urlopen(req) as response:
        return response.read().decode("gbk", "ignore")


def main():
    """Download every chapter listed after the main-volume heading into one text file.

    Fetches the book's index page, walks the direct children of the chapter
    list (``div.listmain > dl``), and for each linked entry that follows the
    "《有妖气客栈》正文卷" heading downloads the chapter page and appends its
    title and text to ``有妖气客栈.txt``.

    Raises:
        SystemExit: if the chapter list cannot be located on the index page.
        urllib.error.URLError: if a page cannot be downloaded.
    """
    from urllib.parse import urljoin

    url = "https://www.biqukan.com/40_40243/"
    headers = {
        # Header values must be Latin-1 encodable; the previous value held a
        # literal "…" (a truncated copy of the UA string), which cannot be sent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) '
                      'Gecko/20100101 Firefox/63.0'
    }

    soup = BeautifulSoup(_fetch_html(url, headers), 'lxml')
    listmain = soup.find('div', class_="listmain")
    if listmain is None or listmain.dl is None:
        raise SystemExit("未找到章节目录: " + url)
    chapter_list = listmain.dl

    # Rough estimate only: assumes the <dl> children alternate between
    # newline text nodes and tags, and that 12 of the tags are not body
    # chapters -- TODO confirm against the current page layout.
    number = (len(chapter_list.contents) - 1) / 2 - 12
    print("计算的章节个数", number)

    begin_flag = False
    with open("有妖气客栈.txt", 'w', encoding='utf-8') as file:
        # Only direct child tags are visited, so whitespace text nodes
        # between entries never reach the attribute lookups below.
        for child in chapter_list.find_all(True, recursive=False):
            if child.string == u"《有妖气客栈》正文卷":
                begin_flag = True
            if not begin_flag or child.a is None:
                continue

            href = child.a.get('href')
            if not href:
                continue
            # urljoin resolves both root-relative and page-relative hrefs.
            download_url = urljoin(url, href)
            download_name = child.a.get_text()

            chapter_soup = BeautifulSoup(_fetch_html(download_url, headers), 'lxml')
            content = chapter_soup.find(id="content", class_="showtxt")
            if content is None:
                # Skip a chapter whose body cannot be found rather than
                # aborting the whole download.
                print("未找到正文,跳过章节:", download_name)
                continue

            # Chapter title, blank line, chapter text, then a trailing
            # newline to separate it from the next chapter.
            file.write(download_name + "\n\n")
            file.write(content.text)
            file.write('\n')
            print("下载完成的章节:", download_name)
    print('completed')


if __name__ == '__main__':
    main()

你可能感兴趣的:(1024!爬取小说)