# verify=False disables SSL certificate verification for every request,
# so suppress the InsecureRequestWarning that urllib3 raises for it.
import requests
import urllib3
urllib3.disable_warnings()

from lxml import etree
from multiprocessing.dummy import Pool  # thread pool, despite the module name
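# A narrower alternative (optional, not in the original): silence only the
# warning that verify=False triggers, instead of all urllib3 warnings:
#   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)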
####Fetch and parse a page####
def get_dow(url):
    # Proxy settings go here if needed; an empty dict means no proxy.
    ip = {}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    req = requests.get(url, headers=headers, proxies=ip, verify=False)
    req.encoding = 'gbk'  # the site serves GBK-encoded pages
    soup = etree.HTML(req.text)
    return soup
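# Sketch (an assumption, not part of the original script): a module-level
# requests.Session would reuse TCP connections across the many chapter
# requests instead of opening a fresh one per call:
#   session = requests.Session()
#   session.headers['user-agent'] = 'Mozilla/5.0 ...'
#   req = session.get(url, proxies=ip, verify=False)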
####Download one chapter (runs in worker threads)####
def cache_download(url):
    # Assuming chapter hrefs end in an 8-character id plus '.html',
    # url[-13:-5] extracts that id to use as an ordering key.
    dictname = url[-13:-5]
    chapter_url = prefix + url
    soup = get_dow(chapter_url)
    content_name = soup.xpath('//*[@class="bookname"]/h1/text()')[0]
    content_text = ''.join(soup.xpath('//*[@id="content"]/text()'))
    chapters[dictname] = [content_name, content_text]
    print(content_name)
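# Thread-safety note: each worker writes a distinct key, and dict item
# assignment is atomic in CPython, so the shared `chapters` dict needs
# no explicit lock here.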
####Merge the downloaded chapters into the text file in order####
def cache_text(url):
    dictname = url[-13:-5]
    content_name, content_text = chapters[dictname]
    with open(name + '.txt', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')
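# Note: the file is opened in append mode ('a'), so re-running the script
# adds to an existing <name>.txt instead of overwriting it.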
####Entry point####
if __name__ == '__main__':
    print('Only supports: https://www.45zw.la/' + '\n')
    chapters = {}
    a = input('Enter the book id from the URL: ')
    prefix = 'https://www.45zw.la/txt/' + a + '/'
    soup = get_dow(prefix)
    chapter_url_list = soup.xpath('//*[@id="list"]/dl/dd/a/@href')
    name = str(soup.xpath('//*[@id="info"]/h1/text()')[0])
    print('\n' + 'The novel to be downloaded is: ' + name + '\n')
    # +11 offsets the index; presumably the first dozen links in #list are
    # the duplicate "latest chapters" block at the top of the page.
    chapter_num = int(input('Enter the starting chapter: ')) + 11
    thread = int(input('\n' + 'How many download threads to start: '))
    pool = Pool(thread)
    pool.map(cache_download, chapter_url_list[chapter_num:])
    pool.close()
    pool.join()
    # Write the chapters out sequentially so the file stays in order.
    for url in chapter_url_list[chapter_num:]:
        cache_text(url)
    print('\n' + '....Download complete....')
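# Example session (all values are illustrative, not real ids or titles):
#   Enter the book id from the URL: 12345
#   The novel to be downloaded is: <book title>
#   Enter the starting chapter: 1
#   How many download threads to start: 8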