To scrape images from the web with a crawler, all you need is the image's address. Take a random one from Baidu as an example: https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1562045484890&di=942bc24341f9fe3e9c63d158a0b65543&imgtype=0&src=http%3A%2F%2Fwww.chinapoesy.com%2FUploadFiles%2FPoesy%2F20141015_92b4978b-973a-472c-b4e8-33d89e01853f.jpg
To save an image you only need to work out the src of its img tag. A simple example: download one photo.
Send the request -----> get the response -----> locate the img tag -----> extract the image address -----> save the file -----> done.
import requests
from bs4 import BeautifulSoup

# Request the page that contains the picture
rs = requests.get("http://www.mmonly.cc/mmtp/swmn/289848_2.html")
soup = BeautifulSoup(rs.text, 'lxml')

# Locate the <img> tag inside the "big-pic" container and read its src
ele = soup.find("div", class_="big-pic")
img = ele.find('img')
src1 = img.get('src')
print(src1)

# Download the image bytes and write them to disk
rs1 = requests.get(src1)
with open('2.jpg', 'wb') as f:
    f.write(rs1.content)
This site uses a typical anti-scraping trick: browsing the pages directly works fine, but if you copy an image address and open it in a new tab you get a 403, because the server checks where the request came from. The workaround is to go in indirectly through the index page, i.e. make the request look as if it was referred by a page on the site. Learned from the Python beginner series on 一言不合就开车的张诚's blog.
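In code, "going in through the index page" amounts to sending a Referer header that points back to a page on the site, which is what the full spider below also does with its referer header. A minimal sketch, reusing src1 from the snippet above (the header values are illustrative):

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    # Pretend the request was clicked through from the page the image belongs to
    "Referer": "http://www.mmonly.cc/mmtp/swmn/289848_2.html",
}
rs1 = requests.get(src1, headers=headers)   # src1: the image URL extracted earlier
with open('2.jpg', 'wb') as f:
    f.write(rs1.content)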
Once you have the index, visit each gallery link in turn, read the highest page number of each gallery, then iterate through the pages and save the images.
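The per-gallery pagination can be sketched like this; page_count and page_urls are hypothetical helpers introduced here for illustration, and gallery_url/headers stand in for one index link and the request headers. The page structure they assume (the last-but-one span inside div.pagenavi holding the page count) is the same one get_max_size relies on below:

import requests
from bs4 import BeautifulSoup

def page_count(gallery_url, headers):
    # The highest page number sits in the second-to-last <span> of div.pagenavi
    html = requests.get(gallery_url, headers=headers)
    soup = BeautifulSoup(html.text, "html.parser")
    return int(soup.find("div", class_="pagenavi").find_all("span")[-2].text)

def page_urls(gallery_url, headers):
    # Every picture of a gallery lives on its own numbered sub-page: /1, /2, ...
    for num in range(1, page_count(gallery_url, headers) + 1):
        yield gallery_url + "/" + str(num)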
(Just kidding) If you only want to keep the more explicit pictures, you could run them through Alibaba Cloud's intelligent image moderation (porn detection) API.
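If you really wired such a check in, it would slot into Save right before the file is written; is_explicit below is a purely hypothetical placeholder, not Alibaba Cloud's actual SDK call:

def is_explicit(image_bytes):
    # Hypothetical hook: call your moderation service of choice here and return True/False
    raise NotImplementedError

# Inside Save(), right before the file is written:
#     if not is_explicit(html.content):
#         continue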
import requests
from bs4 import BeautifulSoup
import os


class mzi_Spider():
    def __init__(self):
        self.all_url = "http://www.mzitu.com/all"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "referer": "https://www.mzitu.com/tag/youhuo/",
        }
        # Storage path (prefix for the per-gallery directories)
        self.path = ''

    def get_link_list(self):
        # Request the index page that lists every gallery
        start_html = requests.get(self.all_url, headers=self.headers)
        # print(start_html.text)
        soup = BeautifulSoup(start_html.text, "html.parser")
        peles = soup.find_all("p", class_="url")
        return peles

    def read_link(self, peles):
        # Collect the <a> tags inside every <p class="url"> group
        aeles = []
        for n in peles:
            aeles.extend(n.find_all("a"))
        return aeles

    def get_max_size(self, a):
        # Request the gallery page and read the highest page number
        html = requests.get(a["href"], headers=self.headers)
        mess = BeautifulSoup(html.text, "html.parser")
        pic_max = mess.find("div", class_="pagenavi").find_all('span')[-2]
        pic_max = pic_max.text
        return pic_max

    def spider_f(self, aeles):
        for a in aeles:
            title = a.get_text()
            if title != "":
                print("ready:" + title)
                dirname = self.path + title.strip().replace('?', '')
                if os.path.exists(dirname):
                    print("directory already exists")
                    flag = 1
                else:
                    # Create a directory for this gallery
                    os.makedirs(dirname)
                    flag = 0
                try:
                    pic_max = self.get_max_size(a)
                except Exception:
                    continue
                if flag == 1 and len(os.listdir(dirname)) >= int(pic_max):
                    print("already saved in full, skipping")
                    continue
                else:
                    self.Save(a, pic_max, title)

    def Save(self, a, pic_max, title):
        dirname = self.path + title.strip().replace('?', '')
        for num in range(1, int(pic_max) + 1):
            # Every picture of a gallery sits on its own numbered sub-page
            pic = a["href"] + "/" + str(num)
            html = requests.get(pic, headers=self.headers)
            mess = BeautifulSoup(html.text, "html.parser")
            pic_url = mess.find('img', alt=title)
            if pic_url is None:
                continue
            print(pic_url['src'])
            html = requests.get(pic_url['src'], headers=self.headers)
            filename = pic_url['src'].split('/')[-1]
            with open(dirname + "/" + filename, "wb") as f:
                f.write(html.content)

    def MainLoop(self):
        # Main logic
        # Fetch the site's gallery index
        peles = self.get_link_list()
        # Read the links it contains
        aeles = self.read_link(peles)
        # Start crawling
        self.spider_f(aeles)


if __name__ == '__main__':
    spiderman = mzi_Spider()
    spiderman.MainLoop()
    print("success")