“我看见一个男人,前几年他无忧无虑,逍遥自在,现在他,一身酒味,两眼无光,满脸憔悴,我很想心疼他一下,于是我伸手摸了一下镜子”
一个比较简单的爬虫源码,爬取 peca 网站作品信息,包括图片及文字内容信息,几乎没有限制,适合新人学习练手使用,文末附上源码供参考学习。
这里简单分享一下批量下载图片的三种方式:第一种是按图片数量开启线程的多线程方式,第二种是在多线程基础上使用队列控制的方式,第三种是采用线程池批量下载图片的方式。
第一种,以图片数量来确定线程数批量下载图片!
import threading
def down_imgs(path, imgs):
    """Download every image in *imgs* concurrently, one thread per image.

    Args:
        path: Directory prefix handed unchanged to ``get_img``.
        imgs: Iterable of image URLs; each URL gets its own thread.
    """
    workers = [
        threading.Thread(target=get_img, args=(path, img_url))
        for img_url in imgs
    ]
    for worker in workers:
        worker.start()
    # Block until every download thread has finished.
    for worker in workers:
        worker.join()
    print(f"恭喜,多线程下载图片完成!")
第二种,使用了队列控制,开启四个线程下载图片。
#使用了队列
import threading
from queue import Queue
def down_four_imgs(path, imgs):
    """Download every image in *imgs* using at most four worker threads.

    All URLs are placed on a queue and each worker keeps pulling URLs until
    the queue is empty. The previous batch-based loop called the blocking
    ``Queue.get()`` a fixed five times per batch, so it hung forever whenever
    the number of remaining images was not a multiple of five.

    Args:
        path: Directory prefix handed unchanged to ``get_img``.
        imgs: Iterable of image URLs.
    """
    from queue import Empty  # only Queue is imported at file level

    img_queue = Queue()
    for img in imgs:
        img_queue.put(img)

    def worker():
        while True:
            try:
                # Non-blocking: nothing is ever added after the fill loop
                # above, so an empty queue means this worker is done.
                imgurl = img_queue.get_nowait()
            except Empty:
                return
            try:
                get_img(path, imgurl)
            except Exception as e:
                # One failed image must not stop this worker from
                # processing the remaining URLs.
                print(f">> {imgurl} 下载图片失败:{e}")

    # Never start more threads than there are images to fetch.
    thread_count = min(4, img_queue.qsize())
    threadings = [threading.Thread(target=worker) for _ in range(thread_count)]
    for t in threadings:
        t.start()
    for t in threadings:
        t.join()
    if threadings:
        print("恭喜,四线程下载图片完成!")
    print("恭喜,多线程下载图片完成!")
第三种,应用了线程池批量下载图片。
#线程池
def downs_imgs(path, imgs):
    """Download every image in *imgs* through a four-worker thread pool.

    Args:
        path: Directory prefix for the saved files.
        imgs: Iterable of image URLs.

    Each URL is paired with *path* as a ``(path, url)`` tuple, the single
    argument that ``gets_img`` expects. Errors are reported together with
    their cause instead of being swallowed by a bare ``except``, and the
    pool is always shut down, even when a download fails.
    """
    datas = [(path, img) for img in imgs]
    pool = None
    try:
        # 4 workers; without an argument the pool size defaults to the
        # number of CPU cores.
        pool = ThreadPool(4)
        pool.map(gets_img, datas)
        print("采集所有图片完成!")
    except Exception as e:
        print(f"Error: unable to start thread: {e}")
    finally:
        if pool is not None:
            pool.close()
            pool.join()
附完整源码:
# -*- coding: UTF-8 -*-
# Author@公众号:eryeji
# https://www.peca.com.mx/
import requests
from lxml import etree
import time
import random
import re
import threading
import os
from queue import Queue
from multiprocessing.dummy import Pool as ThreadPool
def get_ua():
    """Return a User-Agent string picked at random from a fixed list.

    Two entries previously had stray label text fused onto the end
    ("...Safari/537.36Chrome 17.0" and "...Firefox/6.0Firefox 4.0.1");
    they are now well-formed User-Agent values.
    """
    ua_list = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    ]
    return random.choice(ua_list)
def get_hrefs():
    """Fetch the site index, collect the work links and crawl each one.

    Links are read from the ``href`` of every ``<a class="grid-item">`` on
    the home page and passed to ``get_detail``. A failure on one detail
    page is printed and does not stop the remaining pages.
    """
    from urllib.parse import urljoin  # not imported at file level

    url = 'https://www.peca.com.mx/'
    headers = {
        "User-Agent": get_ua()
    }
    response = requests.get(url=url, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)
    hrefs = tree.xpath('//a[@class="grid-item"]/@href')
    print(len(hrefs))
    print(hrefs)
    for href in hrefs:
        # urljoin resolves relative hrefs with or without a leading slash
        # and leaves absolute URLs intact; plain string concatenation
        # produced broken URLs for anything but "/path" style hrefs.
        detail_url = urljoin(url, href)
        try:
            get_detail(detail_url)
        except Exception as e:
            print(f"采集错误:错误代码:{e}")
        # Pause between detail pages to keep the request rate low.
        time.sleep(3)
def get_detail(href):
    """Download one work page: save its text and all gallery images.

    A directory named after the page's ``<h1>`` title is created in the
    current working directory; the description text is written to
    ``<title>/<title>.txt`` and the images are fetched with ``down_imgs``.

    Args:
        href: Absolute URL of the work's detail page.

    Raises:
        ValueError: If the page has no non-blank ``<h1>`` text to use as
            the directory name.
    """
    headers = {
        "User-Agent": get_ua()
    }
    response = requests.get(url=href, headers=headers, timeout=6)
    print(response.status_code)
    html = response.content.decode('utf-8')
    tree = etree.HTML(html)

    # Use the first non-blank <h1> text node. Surrounding whitespace or
    # newlines would otherwise end up in the directory and file names.
    titles = [t.strip() for t in tree.xpath('//h1/text()') if t.strip()]
    if not titles:
        raise ValueError(f"未找到标题 <h1>:{href}")
    pattern = r"[\/\\\:\*\?\"\<\>\|]"
    # Replace characters that are illegal in file names with underscores.
    h1 = re.sub(pattern, "_", titles[0])
    print(h1)

    path = f'{h1}/'
    os.makedirs(path, exist_ok=True)
    print(f">> 生成保存目录 {h1} 文件夹成功!")

    ptexts = tree.xpath('//div[@class="accordion-item__dropdown"]//p/text()')
    ptext = '\n'.join(ptexts)
    print(ptext)
    with open(f'{path}{h1}.txt', 'w', encoding='utf-8') as f:
        f.write(f'{h1}\n{ptext}')
    print(f">> 保存 {h1}.txt 文件成功!")

    imgs = tree.xpath('//div[@class="gallery-masonry-item-wrapper"]//img/@data-src')
    print(len(imgs))
    print(imgs)
    # Alternative download strategies defined in this file:
    # down_four_imgs(path, imgs) or downs_imgs(path, imgs).
    down_imgs(path, imgs)
def get_resp(url, retries=3):
    """GET *url* with a random User-Agent, retrying on request errors.

    Args:
        url: Address to fetch.
        retries: Number of retries after the first failed attempt
            (default 3, i.e. up to four attempts in total).

    Returns:
        The ``requests`` response of the first attempt that does not raise,
        or ``None`` when every attempt raised a ``RequestException``.
        Callers must handle the ``None`` case.
    """
    for attempt in range(retries + 1):
        # A fresh User-Agent is chosen for every attempt.
        headers = {
            "User-Agent": get_ua()
        }
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            if attempt == retries:
                # No retry follows, so neither announce one nor sleep.
                break
            # Linear back-off: 2 s, 4 s, 6 s, ... The message reports the
            # real delay instead of a hard-coded "6S".
            wait = (attempt + 1) * 2
            print(f">> 获取网页出错,{wait}S后将重试获取第:{attempt + 1} 次")
            time.sleep(wait)
        else:
            print(response.status_code)
            return response
    print(f">> 获取网页失败,已重试 {retries} 次:{url}")
    return None
def down_four_imgs(path, imgs):
    """Download every image in *imgs* using at most four worker threads.

    All URLs are placed on a queue and each worker keeps pulling URLs until
    the queue is empty. The previous batch-based loop called the blocking
    ``Queue.get()`` a fixed five times per batch, so it hung forever whenever
    the number of remaining images was not a multiple of five.

    Args:
        path: Directory prefix handed unchanged to ``get_img``.
        imgs: Iterable of image URLs.
    """
    from queue import Empty  # only Queue is imported at file level

    img_queue = Queue()
    for img in imgs:
        img_queue.put(img)

    def worker():
        while True:
            try:
                # Non-blocking: nothing is ever added after the fill loop
                # above, so an empty queue means this worker is done.
                imgurl = img_queue.get_nowait()
            except Empty:
                return
            try:
                get_img(path, imgurl)
            except Exception as e:
                # One failed image must not stop this worker from
                # processing the remaining URLs.
                print(f">> {imgurl} 下载图片失败:{e}")

    # Never start more threads than there are images to fetch.
    thread_count = min(4, img_queue.qsize())
    threadings = [threading.Thread(target=worker) for _ in range(thread_count)]
    for t in threadings:
        t.start()
    for t in threadings:
        t.join()
    if threadings:
        print("恭喜,四线程下载图片完成!")
    print("恭喜,多线程下载图片完成!")
def down_imgs(path, imgs):
    """Spawn one download thread per image URL and wait for all of them.

    Args:
        path: Directory prefix passed through to ``get_img``.
        imgs: Iterable of image URLs.
    """
    pending = []
    for url in imgs:
        job = threading.Thread(target=get_img, args=(path, url))
        job.start()
        pending.append(job)
    # Wait for every started thread before reporting completion.
    while pending:
        pending.pop(0).join()
    print(f"恭喜,多线程下载图片完成!")
def downs_imgs(path, imgs):
    """Download every image in *imgs* through a four-worker thread pool.

    Args:
        path: Directory prefix for the saved files.
        imgs: Iterable of image URLs.

    Each URL is paired with *path* as a ``(path, url)`` tuple, the single
    argument that ``gets_img`` expects. Errors are reported together with
    their cause instead of being swallowed by a bare ``except``, and the
    pool is always shut down, even when a download fails.
    """
    datas = [(path, img) for img in imgs]
    pool = None
    try:
        # 4 workers; without an argument the pool size defaults to the
        # number of CPU cores.
        pool = ThreadPool(4)
        pool.map(gets_img, datas)
        print("采集所有图片完成!")
    except Exception as e:
        print(f"Error: unable to start thread: {e}")
    finally:
        if pool is not None:
            pool.close()
            pool.join()
def gets_img(data):
    """Download one image; single-argument variant for ``pool.map``.

    Args:
        data: A ``(path, img_url)`` pair, as built by ``downs_imgs``.

    The file name is the last ``/``-separated segment of the URL. If
    ``get_resp`` gives up and returns ``None``, the failure is reported and
    no file is written (previously this raised ``AttributeError``).
    """
    path, img_url = data[0], data[1]
    img_name = img_url.split('/')[-1]
    r = get_resp(img_url)
    time.sleep(1)
    if r is None:
        print(f">> {img_name}下载图片失败")
        return
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> {img_name}下载图片成功")
def get_img(path, img_url):
    """Download one image and save it under *path*.

    Args:
        path: Directory prefix (including the trailing separator) that the
            file name is appended to.
        img_url: URL of the image; its last ``/``-separated segment becomes
            the file name.

    If ``get_resp`` gives up and returns ``None``, the failure is reported
    and no file is written (previously this raised ``AttributeError``).
    """
    img_name = img_url.split('/')[-1]
    r = get_resp(img_url)
    time.sleep(1)
    if r is None:
        print(f">> {img_name}下载图片失败")
        return
    with open(f'{path}{img_name}', 'wb') as f:
        f.write(r.content)
    print(f">> {img_name}下载图片成功")
def main():
    """Script entry point: start the crawl by calling ``get_hrefs``."""
    get_hrefs()
# Run the crawler only when executed as a script, not when imported.
if "__main__" == __name__:
    main()
往期推荐
微博爬虫,python微博用户主页小姐姐图片内容采集爬虫
图片爬虫,手把手教你Python多线程下载获取图片
Python下载爬虫,解析跳转真实链接下载文件
Python爬虫,B站视频下载源码脚本工具助手附exe
·················END·················
你好,我是二大爷,
革命老区外出进城务工人员,
互联网非早期非专业站长,
喜好python,写作,阅读,英语
不入流程序,自媒体,seo . . .
公众号不挣钱,交个网友。
读者交流群已建立,找到我备注 “交流”,即可加入我们~
听说点 “在看” 的都变得更好看呐~
关注关注二大爷呗~给你分享python,写作,阅读的内容噢~
扫一扫下方二维码即可关注我噢~
关注我的都变秃了
说错了,都变强了!
不信你试试
扫码关注最新动态
公众号ID:eryeji