import requests
from lxml import etree
import time
'''
Approach:
1. Pick the novel to crawl and its table-of-contents (entry) URL.
2. Crawl the chapter links and build every chapter detail-page URL by string
   concatenation (see the note right after this docstring).
3. Crawl the book title.
4. Crawl each chapter's title.
5. Crawl each chapter's body text.
6. Append the chapters in order and save them all into a single .txt file.
'''
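# Note (illustration only, not from the original post): the table of contents
# lists relative hrefs such as '12345678.html' (hypothetical example), so
# prepending the book URL gives a chapter URL like
# 'http://www.biquge.info/84_84283/12345678.html'; that is what step 2 and
# get_list() below rely on.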
# Request headers: pretend to be a regular browser
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# Entry URL: the novel's table-of-contents page
url = 'http://www.biquge.info/84_84283/'
def get_html(url):
    # Fetch the page
    html = requests.get(url, headers=headers)
    # The site serves UTF-8; set it explicitly so .text decodes correctly
    html.encoding = 'utf-8'
    html_code = html.text
    # Parse the HTML into an element tree
    soup = etree.HTML(html_code)
    # Return the parsed page
    return soup
# Collect the chapter links from the table of contents
def get_list(url):
    soup = get_html(url)
    # Every chapter entry is an <a> under #list
    list_box = soup.xpath('//*[@id="list"]/dl/dd/a/@href')
    # Build the full chapter URLs
    book_lists = []
    for i in list_box:
        # The hrefs are relative, so prepend the book URL
        book_lists.append(url + i)
    return book_lists
# Get the book title
def get_book_title(url):
    soup = get_html(url)
    book_title = soup.xpath('//*[@id="info"]/h1/text()')
    # xpath() returns a list of text nodes; take the first match and strip whitespace
    book_title = book_title[0].strip()
    return book_title
# Get a chapter page's title
def get_title(url):
    soup = get_html(url)
    title = soup.xpath('//*[@id="wrapper"]/div[4]/div/div[2]/h1/text()')
    return title
# Get a chapter page's body text
def get_novel_content(url):
    soup = get_html(url)
    # The chapter body lives in #content as bare text nodes
    content = soup.xpath('//*[@id="content"]/text()')
    return content
# Save the novel to a local .txt file
def save_novel(url):
    book_lists = get_list(url)
    book_title = get_book_title(url)
    num = 1
    with open(book_title + '.txt', 'a', encoding='utf-8') as f:
        for list_url in book_lists:
            chapter_title = get_title(list_url)
            # Strip stray whitespace from the title text nodes and give the title its own line
            for t in chapter_title:
                f.write(t.strip() + '\n')
            chapter_content = get_novel_content(list_url)
            for c in chapter_content:
                f.write(c + '\n')
            # time.sleep(2)  # optional pause between chapters to be polite to the server
            print('*** Chapter {} downloaded ***'.format(num))
            num += 1
if __name__ == '__main__':
    save_novel(url)
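When running this against the real site, the commented-out time.sleep(2) and a request timeout start to matter. Below is a minimal sketch of how the requests.get call in get_html could be hardened; fetch_with_retry, the 10-second timeout, and the retry count are my own assumptions, not part of the original tutorial.

import time
import requests

def fetch_with_retry(url, headers, retries=3, delay=2.0, timeout=10):
    # Assumed helper, not from the original script: time out each request,
    # retry a few times, and sleep between attempts so the site is not hammered.
    last_error = None
    for _ in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=timeout)
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            return resp.text
        except requests.RequestException as err:
            last_error = err
            time.sleep(delay)
    # Every attempt failed; surface the last error to the caller
    raise last_error

get_html could call this instead of requests.get directly; the per-chapter time.sleep(2) in save_novel can then stay commented out or be kept as an extra pause between chapters.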
References: a tutorial video on Bilibili plus this WeChat article: https://mp.weixin.qq.com/s?__biz=MzIxOTcyMDM4OQ==&mid=2247483927&idx=1&sn=d4c9fcb6becc3e1d26a8d8385d8c2b99&chksm=97d7bdbda0a034ab3faf0f30ed50a1e35a0a9edcceb9b2ae9a0a6c7e4efd72a64cde07df439f&token=1524452913&lang=zh_CN#rd
A more elegant write-up of the same idea, clean code and a clear train of thought:
https://blog.csdn.net/sinat_34937826/article/details/105562463?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-10.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-10.nonecase