爬取小说-单线程

## 代码思路
1.获取小说目录页源码
2.获取所有章节链接到集合
3.传入章节的URL,下载章节内容
4.打开每个章节链接并逐一下载并保存内容

import requests
from bs4 import BeautifulSoup

def open_url(url):
    """Fetch a page and return its body decoded as a UTF-8 string.

    Args:
        url: The URL to fetch.

    Returns:
        The page HTML as a UTF-8 decoded string.
    """
    # Browser-like User-Agent so the site does not reject the scraper.
    header = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}
    # Bug fix: the original passed the dict positionally, which made it the
    # `params` query-string argument; it must be `headers=` to be sent as headers.
    request = requests.get(url, headers=header)
    response = request.content  # raw response bytes
    return response.decode('utf-8')

def chapter_url(url):
    """Collect the absolute URL of every chapter on the novel's index page.

    Args:
        url: URL of the novel's table-of-contents page.

    Returns:
        A list of absolute chapter URLs, in page order.
    """
    html = open_url(url)
    soup = BeautifulSoup(html, 'html.parser')
    chapter_links = []
    # All chapter anchors live inside <div id="list">.
    for anchor in soup.find('div', id="list").find_all('a'):
        # hrefs are site-relative, so prefix the site root to build a full URL.
        chapter_links.append('https://www.xsbiquge.com/' + anchor['href'])
    # Print once after collecting; the original printed the growing list on
    # every iteration, producing O(n^2) console output.
    print(chapter_links)
    print(len(chapter_links))
    return chapter_links

def get_content(url):
    """Download one chapter and return its title plus formatted body text.

    Args:
        url: URL of a single chapter page.

    Returns:
        The chapter title followed by the chapter text, with the site's
        whitespace collapsed and re-expanded into paragraph breaks.
    """
    pagehtml = open_url(url)
    soup = BeautifulSoup(pagehtml, 'html.parser')
    chapter_name = soup.h1.string
    # Chapter body lives in <div id="content">.
    chapter_text = soup.find('div', id="content").text
    # Collapse all whitespace runs into single spaces, then turn each space
    # into a paragraph break to reconstruct readable formatting.
    chapter_text = ' '.join(chapter_text.split())
    body = chapter_text.replace(' ', '\r\n\n')
    return chapter_name + '\r\n\n\n' + body

def downloadnovel(url, filename='斗罗大陆之龙王传说.txt'):
    """Download a whole novel chapter by chapter and append it to a text file.

    Args:
        url: URL of the novel's index (table-of-contents) page.
        filename: Output text file. Defaults to the original hard-coded name
            for backward compatibility; pass another name to reuse the
            function for a different novel.
    """
    pagehtml = open_url(url)
    soup = BeautifulSoup(pagehtml, 'html.parser')
    novelname = soup.h1.string
    auther = soup.p.string
    other = soup.find('div', id="info").find_all('p')
    print(novelname)       # 名称
    print(auther)          # 作者
    print(other[1].text)   # 状态
    print(other[-1].text)  # 最新章节
    print(other[-2].text)  # 最后更新
    print('开始下载小说')
    chapterlist = chapter_url(url)  # 传入小说首页,获取所有章节的链接
    lenchapter = len(chapterlist)
    print('这部小说一共有%d 章' % lenchapter)
    # Open the output file once instead of reopening it for every chapter;
    # also avoid shadowing the `url` parameter with the loop variable.
    with open(filename, 'a+', encoding='utf-8') as f:
        for count, chapter in enumerate(chapterlist, start=1):
            f.write(get_content(chapter) + '\r\n\n\n\n')
            progress = (count / lenchapter) * 100
            print('正在下载第%d章,进度%.2f%%' % (count, progress))  # 这里是用来计算进度
    print('下载完成!')


if __name__ == '__main__':
    # Index page of the novel to download.
    index_url = 'https://www.xsbiquge.com/66_66414//'
    downloadnovel(index_url)


原创:https://zhuanlan.zhihu.com/p/80975802

你可能感兴趣的:(爬虫)