Python爬取图片(你懂的)
requests与Bs4
这两个模块是本文使用的主要模块:requests 用于发送 HTTP 请求、获取网页内容;bs4 全名 BeautifulSoup,是编写 Python 爬虫常用库之一,主要用来解析 HTML 标签。这两个模块可以通过 cmd 终端下载:
pip install bs4
pip install requests
代码实现
import requests
from bs4 import BeautifulSoup
import os
class Mzitu():
    """Scraper for www.mzitu.com.

    Walks every listing page of the site, follows each gallery link found
    there, and downloads every image of each gallery into a sub-folder
    (named after the gallery id) under the current working directory.
    """

    def __init__(self):
        # The listing/gallery pages only check for a browser-like
        # User-Agent, so a minimal header set is enough here.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.base_path = os.getcwd()  # download root: current working directory

    def get_url(self, html):
        '''Yield the link (href) of every gallery on a listing page.

        :param html: raw HTML text of one listing page.
        '''
        html_b = BeautifulSoup(html, 'lxml')
        # Gallery links live inside the <ul id="pins"> element.
        urls_b = html_b.find_all('ul', attrs={'id': 'pins'})[0]
        urls = urls_b.find_all('a')
        for i in urls:
            yield i['href']

    def get_img_url_max(self, url):
        '''Return the number of images in the gallery at *url* (as a string).'''
        html_i = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html_i, 'lxml')
        # In the pager the last <a> is "next"; the one before it carries
        # the highest page number, i.e. the image count.
        max_number = html_b.find_all('div', attrs={'class': 'pagenavi'})[0]
        max_number = max_number.find_all('a')[-2].span.text
        return max_number

    def get_img_url(self, url):
        '''Return the URL of the first image of the gallery at *url*.'''
        html_i = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html_i, 'lxml')
        img_url = html_b.find_all('div', attrs={'class': 'main-image'})[0].p.a.img['src']
        return img_url

    def download_img(self, name, url):
        '''Fetch one image and return its raw bytes.

        The image host checks the Referer header (hotlink protection), so a
        fuller header set pointing back at the gallery page is needed here.

        :param name: gallery id, used to build the Referer URL.
        :param url: direct URL of the image file.
        '''
        headers = {
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Host': 'i.meizitu.net',
            'Referer': 'http://www.mzitu.com/%s' % name,
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        img = requests.get(url, headers=headers).content
        return img

    def get_img(self, name, max, img_url):
        '''Download all images of gallery *name* into its own folder.

        :param name: gallery id (becomes the folder name).
        :param max: total number of images in the gallery (str or int).
        :param img_url: URL of the gallery's first image; subsequent image
            URLs are derived by rewriting its numeric suffix.
        '''
        path = os.path.join(self.base_path, name)
        # Idempotent folder creation (replaces the exists/mkdir dance).
        os.makedirs(path, exist_ok=True)
        # BUGFIX: was range(1, int(max)), which skipped the last image;
        # the upper bound must be inclusive.
        for i in range(1, int(max) + 1):
            k = str(i)
            file_name = k + '.jpg'
            img_file_name = os.path.join(path, file_name)
            # Rewrite the numeric suffix of the previous URL in place:
            # "...NN.jpg" — one character for single-digit numbers, two for
            # double-digit ones. (Galleries with >99 images would need a
            # third branch; the site's counts stay below that in practice.)
            if len(k) < 2:
                img_url = img_url[:-5] + k + img_url[-4:]
            else:
                img_url = img_url[:-6] + k + img_url[-4:]
            img = self.download_img(name, img_url)
            with open(img_file_name, 'wb') as f:
                f.write(img)

    def get_html_url_link_max(self):
        '''Return the total number of listing pages on the site (as a string).'''
        url = 'http://www.mzitu.com/'
        html = requests.get(url, headers=self.headers).text
        html_b = BeautifulSoup(html, 'lxml')
        # Last-but-one pager link is the highest page; its href looks like
        # http://www.mzitu.com/page/NNN/, so path segment 4 is NNN.
        max_number = html_b.find_all('a', attrs={'class': 'page-numbers'})[-2]['href']
        max_number = max_number.split('/')[4]
        return max_number

    def main(self):
        '''Crawl every listing page and download every gallery found.'''
        max_number = int(self.get_html_url_link_max())
        for page in range(1, max_number + 1):
            # Listing pages follow the /page/<n>/ URL pattern.
            url = 'http://www.mzitu.com/page/%d/' % page
            html = requests.get(url, headers=self.headers).text
            urls = self.get_url(html)
            for gallery_url in urls:
                name = gallery_url.split('/')[-1]
                count = self.get_img_url_max(gallery_url)
                img_url = self.get_img_url(gallery_url)
                self.get_img(name, count, img_url)
if __name__ == '__main__':
    # Script entry point: build the scraper and start the full crawl.
    Mzitu().main()
运行程序后,即可在同文件夹下发现不断有包含图片的文件夹生成
封装后的exe下载