这次我们来爬取一个图片网站 unsplash.com,为什么要选择这个网站呢?因为这个网站的所有图片都是js动态请求生成的,所以说一般的爬取肯定是不行的啦 ~
这次爬取我们需要借助一款抓包代理工具:Fiddler
评价:强大好用,上手简单,没毛病,妥妥五星
下载链接:https://www.telerik.com/download/fiddler
由此数据可以得出:
图片的请求地址:https://unsplash.com/napi/photos?page=12&per_page=12&order_by=latest
图片的下载地址(download 字段):https://unsplash.com/photos/J3hfM2S2KN0/download
import requests,json
if __name__ == '__main__':
    # Listing endpoint: first page, 30 photos, newest first.
    target = 'https://unsplash.com/napi/photos?page=1&per_page=30&order_by=latest'
    # verify=False skips TLS certificate verification; it triggers a warning
    # at runtime.
    req = requests.get(url=target, verify=False)
    # Fail loudly on an HTTP error instead of indexing into an error body as
    # if it were the photo list.
    req.raise_for_status()
    jsons = json.loads(req.text)
    # The loop expects a JSON array with one object per photo, each carrying
    # an 'id' key.
    for number, photo in enumerate(jsons, start=1):
        print('图片' + str(number) + '的ID: ', photo['id'])
踩坑:由于是HTTPS请求,所以需要设置 verify=False ,但是这么做会报警告,最好的做法如下:
解决方法:https://www.cnblogs.com/BlueSkyyj/p/7594533.html
注意:执行py文件的时候,记得关闭Fiddler
def download_pics(self, pid, file_name):
    """Download one photo and save it as ``images/<file_name>.jpg``.

    Args:
        pid: Photo id; it replaces the ``xxx`` placeholder in
            ``self.down_domain`` to form the download URL.
        file_name: Base name (without extension) of the file to write.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os  # local import: only this method needs it

    # open() does not create missing directories, so make sure it exists.
    os.makedirs('images', exist_ok=True)
    url = self.down_domain.replace('xxx', pid)
    with closing(requests.get(url=url, stream=True)) as req:
        # Check the status before touching the disk so an error page is
        # never saved under a .jpg name.
        req.raise_for_status()
        # 'wb' truncates: a re-run must replace an existing file rather than
        # append a second image to the end of it.
        with open('images/%s.jpg' % file_name, 'wb') as f:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)
import requests,sys,time,json
from contextlib import closing
import t2
class get_pictures(object):
    """Collect photo ids from the Unsplash listing endpoint and download them.

    Typical use: call ``get_ids()`` to fill ``pid``, then call
    ``download_pics()`` once per collected id.
    """

    def __init__(self):
        super(get_pictures, self).__init__()
        self.pid = []  # photo ids collected by get_ids()
        # Download URL template; 'xxx' is replaced by a photo id.
        self.down_domain = 'https://unsplash.com/photos/xxx/download'
        self.down_domain_href = 'https://unsplash.com/photos/'
        # Listing endpoint; the query string is appended per page,
        # e.g. ?page=1&per_page=30&order_by=latest
        self.target = 'https://unsplash.com/napi/photos'
        self.cpage = 0  # current page (not read by any method of this class)
        self.apage = 5  # number of listing pages to fetch
        self.per_page = 30  # photos per listing page
        self.time = 1  # seconds to pause after each listing request

    def _page_urls(self):
        """Return the listing URLs for pages 1 through ``self.apage``."""
        # Pages are numbered from 1; starting at 0 would request a page
        # that is not part of the 1-based sequence.
        return [
            self.target
            + '?page=' + str(page)
            + '&per_page=' + str(self.per_page)
            + '&order_by=latest'
            for page in range(1, self.apage + 1)
        ]

    def get_ids(self):
        """Fetch the listing pages and append every photo id to ``self.pid``.

        Raises:
            requests.HTTPError: If a listing request returns an error status.
        """
        for url in self._page_urls():
            req = requests.get(url)
            # Stop on an HTTP error instead of parsing the error body as a
            # photo list.
            req.raise_for_status()
            imgs = json.loads(req.text)
            self.pid.extend(img['id'] for img in imgs)
            # Pause between listing requests.
            time.sleep(self.time)

    def download_pics(self, pid, file_name):
        """Download one photo and save it as ``images/<file_name>.jpg``.

        Args:
            pid: Photo id; it replaces the ``xxx`` placeholder in
                ``self.down_domain`` to form the download URL.
            file_name: Base name (without extension) of the file to write.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        import os  # local import: only this method needs it

        # open() does not create missing directories, so make sure it exists.
        os.makedirs('images', exist_ok=True)
        url = self.down_domain.replace('xxx', pid)
        with closing(requests.get(url=url, stream=True)) as req:
            # Check the status before touching the disk so an error page is
            # never saved under a .jpg name.
            req.raise_for_status()
            # 'wb' truncates: a re-run must replace an existing file rather
            # than append a second image to the end of it.
            with open('images/%s.jpg' % file_name, 'wb') as f:
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
# Script entry point: collect the photo ids, then download them one by one.
if __name__ == '__main__':
    gp = get_pictures()
    gp.get_ids()
    print('图片下载中……')
    # Photos are numbered from 1 in both the progress message and the file name.
    for number, photo_id in enumerate(gp.pid, start=1):
        print('正在下载第%d张图片' % number)
        gp.download_pics(photo_id, '图片' + str(number))