Overview
- Bookworms often prefer reading novels locally as txt files: some want to keep a collection, and some pages don't list the individual chapters.
Given that, we can download the novel straight from the web page.
If the text is labeled with chapter titles but is actually served section by section, a regex match-and-replace can normalize the chapter names afterwards; a minimal sketch follows.
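As an illustration, here is a hedged sketch of such a normalization pass, assuming headings shaped like 第12章 or 第十二章 标题; the pattern and the normalize_title helper are illustrative, not part of the downloader below.

import re

def normalize_title(line):
    # Hypothetical pattern: match headings such as "第12章 xxx" or
    # "第十二章 xxx" and rewrite them into a uniform "第N章 title" form.
    m = re.match(r'^\s*第\s*([0-9一二三四五六七八九十百千]+)\s*[章节]\s*(.*)$', line)
    if m:
        return ('第' + m.group(1) + '章 ' + m.group(2)).strip()
    return line  # non-heading lines pass through unchanged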
Code example
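The script below reads the novel's table-of-contents URL from the clipboard, scrapes the chapter names and links, fetches the chapter bodies on 16 threads, and writes them out as a single txt file. The CSS selectors (elList, elName, elContent) are specific to one site and will need adjusting for any other.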
import threading
import time
import codecs
import requests_html
import pyperclip
import urllib.parse
'''
The novel site's URL is pasted from the clipboard via pyperclip, e.g.
https://www.xxx.xxx
Note: requests_html.py was patched so that
html = html.decode(DEFAULT_ENCODING, 'replace'),
and the page encoding is set to gb18030.
'''
elList = 'ul.clearfix'       # chapter list container
elName = 'div.desc h1'       # novel title
elContent = 'div#mycontent'  # chapter body
begin = time.perf_counter()
class MyThread(threading.Thread):
    def __init__(self, threadID, name, st):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.st = st  # starting chapter index for this thread

    def run(self):
        print(self.st)
        threadget(self.st)
txtcontent = {}      # chapter index -> chapter body
chaptername = []     # chapter titles, in page order
chapteraddress = []  # chapter URLs, aligned with chaptername
def getchapter(chapter_list):
    # Collect every chapter title and its absolute URL.
    for a in chapter_list.find('a'):
        href = a.attrs['href']
        chaptername.append(a.text)
        chapteraddress.append('https://' + _ip + href)
def getdetail(url):
    # Fetch one chapter page and strip the injected JS calls from its text.
    page = session.get(url)
    page.encoding = 'utf-8'
    text = page.html.find(elContent, first=True).text
    return text.replace('app2();\nread2();', '')
def threadget(st):
    # Thread st handles chapters st, st + thread_count, st + 2*thread_count, ...
    total = len(chaptername)
    while st < total:
        txtcontent[st] = getdetail(chapteraddress[st])
        st += thread_count
requests_html.DEFAULT_ENCODING = 'gb18030'  # the target site is GB18030-encoded
session = requests_html.HTMLSession()
url = pyperclip.paste()  # table-of-contents URL is read from the clipboard
page = session.get(url)
page.encoding = 'utf-8'
chapter_list = page.html.find(elList, first=True)
name = page.html.find(elName, first=True).text
_ip = urllib.parse.urlsplit(url).netloc  # host, used to build absolute chapter URLs
getchapter(chapter_list)
thread_list = []
thread_count = 16
for id in range(thread_count):
    thread_list.append(MyThread(id, str(id), id))
for t in thread_list:
    t.daemon = False
    t.start()
for t in thread_list:
    t.join()
print('\nAll worker threads finished')
# Write the chapters out in their original order.
file = codecs.open('D:/爬虫/待替换/' + name + '.txt', 'w', 'utf-8')
for key in sorted(txtcontent):
    file.write(str(txtcontent[key]))
file.close()
end = time.perf_counter()
print('Download finished, total time:', end - begin, 'seconds')
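One note on the work split: thread n fetches chapters n, n + thread_count, n + 2*thread_count, and so on, so every chapter is downloaded exactly once and each thread writes to distinct keys of the shared dict, which keeps the threads from stepping on each other's entries. A tiny self-check of that invariant (the numbers are illustrative):

def covered(total, thread_count):
    # The union of all per-thread strides must be exactly 0..total-1.
    seen = sorted(i for st in range(thread_count)
                  for i in range(st, total, thread_count))
    return seen == list(range(total))

assert covered(100, 16)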
Finally
Code repository: my-py/爬虫/novel