Novel scraper: forcing a bypass of SSL verification

requests.get(url, verify=False)

verify=False skips TLS certificate verification, which is what lets the scraper fetch pages from a site whose certificate would otherwise fail validation.

import requests
# Suppress the InsecureRequestWarning that urllib3 emits once
# certificate verification is disabled
import urllib3
urllib3.disable_warnings()
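
To see what verify=False actually bypasses, here is a minimal, self-contained sketch; it uses https://self-signed.badssl.com/, a public TLS test endpoint, purely as an illustration:

import requests
import urllib3

urllib3.disable_warnings()  # silence the InsecureRequestWarning emitted below

try:
    # The default (verify=True) rejects the self-signed certificate
    requests.get('https://self-signed.badssl.com/', timeout=10)
except requests.exceptions.SSLError:
    print('certificate verification failed')

# verify=False skips the check, so the same request goes through
resp = requests.get('https://self-signed.badssl.com/', timeout=10, verify=False)
print(resp.status_code)

Note that disabling verification also disables protection against man-in-the-middle attacks, so it should only be used for sites where that risk is acceptable.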

from lxml import etree
from multiprocessing.dummy import Pool

#### Fetch and parse a page ####
def get_dow(url):
    # Optional proxies; leave the dict empty to connect directly
    proxies = {}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
    }
    req = requests.get(url, headers=headers, proxies=proxies, verify=False)
    req.encoding = 'gbk'  # the site serves GBK-encoded pages
    return etree.HTML(req.text)
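
For reference, the lxml side of get_dow amounts to parsing the response body and querying it with XPath. A tiny sketch on an inline HTML fragment (the markup here is invented for illustration; the XPath expressions mirror the ones used below):

from lxml import etree

html = '<div id="info"><h1>Sample Title</h1></div>' \
       '<div id="content">line one<br/>line two</div>'
soup = etree.HTML(html)                                    # parses even broken HTML
print(soup.xpath('//*[@id="info"]/h1/text()')[0])          # -> Sample Title
print(''.join(soup.xpath('//*[@id="content"]/text()')))    # -> line oneline two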
        
#### Download one chapter (runs on worker threads) ####
def cache_download(url):
    # Slice the chapter id out of the href; assumes hrefs end in '<id>.html'
    dictname = url[-13:-5]
    chapter_url = prefix + url
    soup = get_dow(chapter_url)
    content_name = soup.xpath('//*[@class="bookname"]/h1/text()')[0]
    content_text = ''.join(soup.xpath('//*[@id="content"]/text()'))
    chapters[dictname] = [content_name, content_text]
    print(content_name)

#### Merge the cached chapters into the file in order ####
def cache_text(url):
    dictname = url[-13:-5]
    content_name, content_text = chapters[dictname]
    with open(name + '.txt', 'a', encoding='utf-8') as f:
        f.write(content_name + '\n')
        f.write(content_text + '\n')
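
The reason this two-pass design yields an ordered file: Pool.map blocks until every download task has finished, the threads may complete in any order, and the second pass walks the chapter list in its original order. A minimal sketch of the same pattern with toy data:

from multiprocessing.dummy import Pool  # thread pool: workers share this process's dict
import random
import time

results = {}

def fetch(key):
    time.sleep(random.random())  # simulate downloads finishing out of order
    results[key] = key.upper()

keys = ['ch1', 'ch2', 'ch3', 'ch4']
pool = Pool(4)
pool.map(fetch, keys)   # blocks until every task is done
pool.close()
pool.join()

for key in keys:        # iterate in the original order for a sequential merge
    print(results[key])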

#### Entry point ####
if __name__ == '__main__':
    print('Only supports: https://www.45zw.la/' + '\n')
    chapters = {}
    a = input('Enter the link code to download: ')
    prefix = 'https://www.45zw.la/txt/' + a + '/'
    soup = get_dow(prefix)
    chapter_url_list = soup.xpath('//*[@id="list"]/dl/dd/a/@href')
    name = str(soup.xpath('//*[@id="info"]/h1/text()')[0])
    print('\n' + 'The novel to be downloaded is: ' + name + '\n')
    # +11 skips the leading links in the list, which appear to be the site's
    # fixed "latest chapters" block rather than real chapters
    chapter_num = int(input('Enter the starting chapter: ')) + 11
    threads = int(input('\n' + 'How many download threads to start: '))
    pool = Pool(threads)
    pool.map(cache_download, chapter_url_list[chapter_num:])  # blocks until all chapters are cached
    pool.close()
    pool.join()
    # Write the cached chapters out sequentially, in the site's original order
    for chapter_url in chapter_url_list[chapter_num:]:
        cache_text(chapter_url)
    print('\n' + '....Download complete....')
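
Assuming the script is saved as, say, novel_spider.py (a hypothetical filename), a run looks like: start it with python novel_spider.py, enter the link code taken from the novel's URL on the site (the part that fills in https://www.45zw.la/txt/<code>/), then the starting chapter and thread count; the result is appended to <novel name>.txt in the working directory.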
