Task
- Crawl the four novels listed above
- Use the requests library
- Don't miss a single chapter
- Finish the crawl within a limited time
- Save each chapter in the format shown below
Setting up an IP proxy
Free IP proxy sites (a scraping sketch follows the list):
- https://seofangfa.com/proxy/
- http://www.data5u.com/
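Rather than hard-coding addresses, you could scrape candidates from one of these pages. A rough sketch, assuming the page exposes proxies as plain ip:port text; many proxy sites put the IP and port in separate table cells instead, so the pattern usually needs adjusting per site:

import re
import requests

# hypothetical target: the first free-proxy site listed above
html = requests.get("https://seofangfa.com/proxy/", timeout=10).text

# match bare ip:port pairs anywhere in the page source
candidates = re.findall(r'\d{1,3}(?:\.\d{1,3}){3}:\d{2,5}', html)
print(candidates)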
Proxy test site: http://httpbin.org/ (used in the code below)
Testing whether each proxy is usable
import requests

proxy = ['221.131.158.246:8888', '183.245.8.185:80', '218.7.171.91:3128',
         '223.82.106.253:3128', '58.250.21.56:3128', '221.6.201.18:9999',
         '27.220.51.34:9000', '123.149.136.187:9999', '125.108.127.160:9000',
         '1.197.203.254:9999', '42.7.30.35:9999', '175.43.56.24:9999',
         '125.123.154.223:3000', '27.43.189.161:9999', '123.169.121.100:9999']

for i in proxy:
    proxies = {
        'http': 'http://' + i,
        'https': 'https://' + i
    }
    print(proxies)
    try:
        # Route the request through the candidate proxy; the original passed
        # proxies=None, which silently bypassed the proxy being tested.
        # A short timeout keeps dead proxies from hanging the loop.
        response = requests.get("http://httpbin.org/", proxies=proxies, timeout=5)
        print(response.text)
    except requests.exceptions.RequestException as e:
        print('Error', e.args)
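A plain 200 response doesn't prove the proxy was actually used. A stricter check is to ask httpbin which IP the request arrived from; a minimal sketch (the /ip endpoint is part of httpbin's real API, and the sample address is one entry from the list above):

import requests

proxies = {'http': 'http://221.131.158.246:8888',
           'https': 'https://221.131.158.246:8888'}
try:
    # httpbin.org/ip echoes back the origin IP it sees; if the proxy is
    # in effect, this is the proxy's address, not your own public IP.
    origin = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5).json()["origin"]
    print("request came from:", origin)
except requests.exceptions.RequestException as e:
    print("proxy failed:", e)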
Randomly picking one IP
import requests
from random import choice

def get_proxy():
    proxy = ['221.131.158.246:8888', '183.245.8.185:80', '218.7.171.91:3128',
             '223.82.106.253:3128', '58.250.21.56:3128', '221.6.201.18:9999',
             '27.220.51.34:9000', '123.149.136.187:9999', '125.108.127.160:9000',
             '1.197.203.254:9999', '42.7.30.35:9999', '175.43.56.24:9999',
             '125.123.154.223:3000', '27.43.189.161:9999', '123.169.121.100:9999']
    return choice(proxy)

proxy = get_proxy()
proxies = {
    'http': 'http://' + proxy,
    'https': 'https://' + proxy
}
print(proxies)
try:
    # As above: actually pass the chosen proxy instead of proxies=None.
    response = requests.get("http://httpbin.org/", proxies=proxies, timeout=5)
    print(response.text)
except requests.exceptions.RequestException as e:
    print('Error', e.args)
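choice can easily land on a dead proxy, so a natural refinement is to retry a few times, drawing a fresh proxy on each failure. A minimal sketch of that idea (the helper name get_with_retry and the retry count of 3 are arbitrary choices, not part of the original code):

import requests
from random import choice

def get_proxy():
    # same idea as above: draw one address from a hard-coded list
    proxy = ['221.131.158.246:8888', '183.245.8.185:80', '218.7.171.91:3128']
    return choice(proxy)

def get_with_retry(url, retries=3):
    """Try the URL through up to `retries` randomly chosen proxies."""
    for _ in range(retries):
        p = get_proxy()
        proxies = {'http': 'http://' + p, 'https': 'https://' + p}
        try:
            r = requests.get(url, proxies=proxies, timeout=5)
            r.raise_for_status()
            return r
        except requests.exceptions.RequestException:
            continue  # dead or slow proxy: draw another and try again
    return None  # every attempt failed

print(get_with_retry("http://httpbin.org/"))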
Full code
import requests
import re
import os
import threading
from random import choice

def get_proxy():
    proxy = ['221.131.158.246:8888', '218.7.171.91:3128', '58.250.21.56:3128']
    return choice(proxy)

def getHTMLText(url, timeout=100):
    try:
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36",
        }
        proxy = get_proxy()
        print(proxy)
        proxies = {
            'http': 'http://' + proxy,
            'https': 'https://' + proxy
        }
        # pass the timeout through so a dead proxy cannot hang a thread forever
        r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.exceptions.RequestException:
        return 'error'

def write_file(file, content):
    # chapter pages keep the title in an <h1> and the body in <div id="content">
    title_content = re.findall(r'<h1>(.*?)</h1>[\s\S]*?<div id="content">([\s\S]*?)</div>', content)
    for title, content in title_content:
        content = content.replace('&nbsp;', ' ').replace('<br />', '\n')
        with open(file, 'w', encoding='utf-8') as f:
            f.write('\t\t\t\t' + title + '\n\n\n\n')
            f.write(content)

def download(book, title, href):
    '''
    book: novel name
    title: chapter title
    href: URL of the chapter content
    '''
    content = getHTMLText(href)
    write_file(os.path.join(book, title + '.txt'), content)

def main():
    threads = []
    url = "http://www.xbiquge.la"
    html = getHTMLText(url)
    # the recommendation blocks on the home page hold each novel's link and name
    novel_info = re.findall(r'<div class="item">[\s\S]*?<dt>[\s\S]*?<a href="(.*?)">(.*?)</a>', html)
    for href, book in novel_info:
        print(href, book)
        if not os.path.exists(book):
            os.mkdir(book)
        novel = getHTMLText(href)
        # the chapter list is a series of <dd><a href='...'>title</a></dd> entries
        chapter_info = re.findall(r"<dd><a href='(.*?)' >(.*?)</a></dd>", novel)
        for href, title in chapter_info:
            href = url + href
            print(href, title)
            T = threading.Thread(target=download, args=(book, title, href))
            T.daemon = False
            T.start()
            threads.append(T)
    for T in threads:
        T.join()

if __name__ == "__main__":
    main()
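One thread per chapter can mean thousands of simultaneous threads for a long novel. A bounded pool does the same work with controlled concurrency; a minimal sketch using the standard library's concurrent.futures (the max_workers value is an arbitrary choice, and the stand-in download shown here represents the download() defined in the full code above):

from concurrent.futures import ThreadPoolExecutor

def download(book, title, href):
    # stand-in for the download() defined in the full code above
    print("fetching", book, title, href)

# (book, title, href) tuples collected from the chapter list
chapters = [("novel", "chapter-1", "http://www.xbiquge.la/1/1/1.html")]

# at most 20 chapters are fetched at once; leaving the with-block
# waits for every submitted task to finish
with ThreadPoolExecutor(max_workers=20) as pool:
    for book, title, href in chapters:
        pool.submit(download, book, title, href)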
Result
Summary
The free proxies above are unstable, so requests can fail or hang. Solutions:
- Skip the IP proxy for this task, or use a paid IP proxy service and build your own proxy pool (see the sketch below)
- Add timeout handling
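A minimal sketch of such a pool, assuming candidates are validated against httpbin.org as in the test code above and only the survivors are drawn from (the helper name build_pool is an illustrative choice, not from the original):

import requests
from random import choice

CANDIDATES = ['221.131.158.246:8888', '183.245.8.185:80', '218.7.171.91:3128']

def build_pool(candidates, test_url="http://httpbin.org/", timeout=5):
    """Keep only the proxies that answer within the timeout."""
    pool = []
    for p in candidates:
        proxies = {'http': 'http://' + p, 'https': 'https://' + p}
        try:
            requests.get(test_url, proxies=proxies, timeout=timeout).raise_for_status()
            pool.append(p)
        except requests.exceptions.RequestException:
            pass  # dead proxy: leave it out of the pool
    return pool

pool = build_pool(CANDIDATES)
print(len(pool), "live proxies")
if pool:
    print("using", choice(pool))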