踩过的坑:
1. OSError: [Errno 22] Invalid argument —— 创建 jpg 文件时,直接把图片链接用作文件名,而链接中含有 '/',因此报错。解决方法:对链接做切片,只取其中一小段作为文件名。
2. TypeError: a bytes-like object is required, not 'str' —— 把请求图片 URL 得到的 response 写入图片文件时报错。resp.text 返回的是 Unicode 字符串(str),
而图片文件是以二进制模式('wb')打开的,所以应改用 resp.content,它返回的是 bytes 类型,也就是二进制数据。
#coding=utf-8
import time
import requests
from lxml import etree
import time
from multiprocessing.dummy import Pool
# HTTP headers sent with every request made by this script.
# The header name must be spelled "User-Agent": a key such as "userAgent" is
# transmitted as an unrelated header, so the server would still see the
# default python-requests user agent.
# The value is assembled with implicit string concatenation rather than a
# backslash continuation inside the literal, so the single spaces between the
# product tokens are explicit and do not depend on source indentation.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/65.0.3325.181 Mobile Safari/537.36'
    ),
}
def get_info(url, timeout=10):
    '''
    Download every photo referenced on one listing page.

    Fetches ``url``, parses the returned HTML, selects the ``<img>`` under the
    first ``<a>`` of each element whose class attribute is exactly
    "photo-item photo-item--overlay", and saves each image's ``src`` as a
    ``.jpg`` file in the current working directory.

    Args:
        url: Address of the listing page to scrape.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.RequestException: If the page or an image cannot be fetched,
            including when the server answers with a 4xx/5xx status.
    '''
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page.
    response.raise_for_status()
    selector = etree.HTML(response.text.encode('utf-8'))
    images = selector.xpath('//*[@class="photo-item photo-item--overlay"]/a[1]/img')

    # An <img> without a src attribute yields None; drop those up front so the
    # slicing below cannot raise TypeError.
    photo_urls = [src for src in (img.get('src') for img in images) if src]

    for photo_url in photo_urls:
        # Download before opening the output file, so a failed request never
        # leaves an empty or truncated .jpg behind.
        data = requests.get(photo_url, headers=headers, timeout=timeout)
        data.raise_for_status()
        # The full URL cannot be used as a file name because it contains '/',
        # so characters 33..38 of the URL are used instead.
        # NOTE(review): two URLs sharing that slice overwrite each other.
        with open(photo_url[33:39] + '.jpg', 'wb') as fp:
            # .content is the raw bytes of the body; .text would be a str and
            # cannot be written to a file opened in binary mode.
            fp.write(data.content)
if __name__ == '__main__':
    # Listing pages to scrape; range(1, 2) yields only page 1.
    urls = list(map('https://www.pexels.com/?page={}'.format, range(1, 2)))

    # Sequential run: handle the pages one after another and time the batch.
    started = time.time()
    for page_url in urls:
        print(page_url)
        get_info(page_url)
    elapsed = time.time() - started
    print('time1 : ', elapsed)

    # Multithreaded variant (disabled): the same work spread over a pool of
    # six workers, timed separately for comparison with the run above.
    # started2 = time.time()
    # pool = Pool(processes=6)
    # pool.map(get_info, urls)
    # elapsed2 = time.time() - started2
    # print('time2 : ', elapsed2)