爬取的思路与我之前写的 Java 爬虫博客一致,这里不再分析;需要详解的读者请看我的另一篇博客:
《JAVA爬虫多线程高速爬取高清电脑壁纸》
直接看代码吧,里面有注释。我初次学习 Python,如有问题请多指教。
import requests
import threading
from bs4 import BeautifulSoup
# A picture page URL is assembled as str1 + <picture id> + str2.
# The prefix contains the literal "1920x1080" -- presumably the requested
# wallpaper resolution (TODO confirm against the site).
str1 = "http://desk.zol.com.cn/showpic/1920x1080_"  # part before the id
str2 = "_101.html"  # part after the id
def get(url, i):
    """Download the picture shown on page ``url`` and save it as ``<i>.jpg``.

    The page at ``url`` is fetched with browser-like request headers and
    parsed; the ``src`` of the first ``<img>`` tag is taken as the picture
    URL, which is then downloaded and written to ``E:/picture/<i>.jpg``.

    Args:
        url: Address of the HTML page that embeds the picture.
        i: Picture id; only used to build the output file name and the
            progress messages.

    Raises:
        requests.RequestException: If either HTTP request fails.
        OSError: If the output file cannot be opened for writing (for
            example when the ``E:/picture/`` directory does not exist).
    """
    print(threading.current_thread().name)
    # These headers must be passed to requests.get() to be sent at all.
    # Assigning them to the response object afterwards (as the code did
    # before) only overwrites the *received* headers and has no effect on
    # the request.
    headers = {
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'close',
    }
    r = requests.get(url, headers=headers)  # fetch the page HTML
    # Locate the first <img> tag and read the picture URL from it.
    soup = BeautifulSoup(r.text, 'lxml')
    img = soup.img
    if img is None or not img.get('src'):
        # Without this guard a page lacking an <img src=...> would fail with
        # an opaque TypeError/KeyError; report it and skip this picture.
        print("img: " + str(i) + ".jpg skipped: no <img src> found on " + url)
        return
    imgurl = img['src']
    r = requests.get(imgurl)
    path = 'E:/picture/' + str(i) + '.jpg'
    # Save the picture; the with-block closes the file on exit, so no
    # explicit close() is needed.
    with open(path, 'wb') as f:
        f.write(r.content)
    print("img: " + str(i) + ".jpg download successfully!")
# Download with multiple threads: one thread per picture.
if __name__ == '__main__':
    ts = []
    for i in range(91200, 92200):
        url = str1 + str(i) + str2
        ts.append(threading.Thread(target=get, args=(url, i)))
    # NOTE(review): this starts one thread for every id in the range at
    # once; consider a bounded pool if the site or the OS objects.
    for t in ts:
        try:
            t.start()
        except RuntimeError as e:
            # start() raises RuntimeError when the thread cannot be started.
            # Report it and keep starting the remaining threads. Exceptions
            # raised inside get() happen in the worker thread and never
            # reach this handler.
            print("could not start " + t.name + ": " + str(e))