Posting the code directly, with comments; I'll write a proper explanation when I have time.
Part 1: Single-threaded download:
# -*- coding:utf-8 -*-
import os
import time
import requests
from bs4 import BeautifulSoup

# Directory where the scraped images are saved
SAVE_DIR = os.path.join(os.getcwd(), 'pictures')
if not os.path.exists(SAVE_DIR):
    os.mkdir(SAVE_DIR)
# Parse the HTML with BeautifulSoup
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # find_all() returns a 'bs4.element.ResultSet' whose elements are 'bs4.element.Tag' objects;
    # a 'bs4.element.Tag' exposes child tags via find() and attribute values via get()
    divs = soup.find_all('div', attrs={'class': 'item masonry_brick masonry-brick'})
    imgs = [div.find('a').find('img') for div in divs]
    # Map each image's alt text (its name) to its src URL
    dict_data = {img.get('alt'): img.get('src') for img in imgs}
    return dict_data
# Download a single image and return its raw bytes (None on failure)
def get_image_content(url, headers):
    print('Downloading', url)
    try:
        # Use a throwaway session so connections are not kept alive
        s = requests.session()
        s.keep_alive = False
        r = s.get(url, headers=headers)
        r.raise_for_status()
        return r.content
    except Exception as e:
        print('Image download failed:', str(e))
        return None
# Download all images on one listing page
def spider_images(page=1):
    url = 'https://www.mmonly.cc/ktmh/hzw/list_34_%d.html' % page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/76.0.3809.132 Safari/537.36',
               'Connection': 'close'
               }
    try:
        resp = requests.get(url, headers=headers)
        resp.encoding = resp.apparent_encoding  # guard against mojibake
        resp.raise_for_status()  # raise if the status code is 4XX or 5XX
    except Exception as e:
        print('Page fetch failed:', str(e))
        return
    # Parse the HTML
    html = resp.text
    dict_data = parse_html(html)
    for img_name, img_url in dict_data.items():
        img_name = img_name.replace('/', ' ')
        file_path = os.path.join(SAVE_DIR, img_name + '.jpg')
        # Overwrite any previous download of the same image
        if os.path.exists(file_path):
            os.remove(file_path)
        img_bytes = get_image_content(img_url, headers)
        if img_bytes is None:
            print('Bad image URL from the site: {}'.format(img_url))
            continue
        with open(file_path, 'wb') as f:
            f.write(img_bytes)
        print('Image saved')
# Batch-download the images from the first 5 pages
def batch_spider_images():
    t0 = time.time()
    for i in range(1, 6):
        spider_images(i)
        # Pause between pages to avoid hammering the server
        print('Pausing for a moment')
        time.sleep(3)
    elapsed = time.time() - t0
    print('Total time elapsed: {}'.format(elapsed))

if __name__ == '__main__':
    batch_spider_images()
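As a quick sanity check of the parsing step, here is a minimal sketch of parse_html() run on a hand-written snippet. The markup below is a made-up stand-in shaped like the site's listing pages, not actual page source:

# sample_html is a made-up snippet for illustration, not real site markup
sample_html = '''
<div class="item masonry_brick masonry-brick">
  <a href="/ktmh/hzw/1.html"><img alt="Sample One" src="https://example.com/1.jpg"></a>
</div>
<div class="item masonry_brick masonry-brick">
  <a href="/ktmh/hzw/2.html"><img alt="Sample Two" src="https://example.com/2.jpg"></a>
</div>
'''
print(parse_html(sample_html))
# {'Sample One': 'https://example.com/1.jpg', 'Sample Two': 'https://example.com/2.jpg'}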
Part 2: Multi-threaded download:
# -*- coding:utf-8 -*-
import os
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup

# Number of worker threads
MAX_WORKERS = 5

# Directory where the scraped images are saved
SAVE_DIR = os.path.join(os.getcwd(), 'pictures')
if not os.path.exists(SAVE_DIR):
    os.mkdir(SAVE_DIR)
# Parse the HTML with BeautifulSoup
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # find_all() returns a 'bs4.element.ResultSet' whose elements are 'bs4.element.Tag' objects;
    # a 'bs4.element.Tag' exposes child tags via find() and attribute values via get()
    divs = soup.find_all('div', attrs={'class': 'item masonry_brick masonry-brick'})
    imgs = [div.find('a').find('img') for div in divs]
    # Map each image's alt text (its name) to its src URL
    dict_data = {img.get('alt'): img.get('src') for img in imgs}
    return dict_data
# Download a single image and return its raw bytes (None on failure)
def get_image_content(url, headers):
    print('Downloading', url)
    try:
        # Use a throwaway session so connections are not kept alive
        s = requests.session()
        s.keep_alive = False
        r = s.get(url, headers=headers)
        r.raise_for_status()
        return r.content
    except Exception as e:
        print('Image download failed:', str(e))
        return None
# Download all images on one listing page
def spider_images(page=1):
    url = 'https://www.mmonly.cc/ktmh/hzw/list_34_%d.html' % page
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/76.0.3809.132 Safari/537.36',
               'Connection': 'close'
               }
    try:
        resp = requests.get(url, headers=headers)
        resp.encoding = resp.apparent_encoding  # guard against mojibake
        resp.raise_for_status()  # raise if the status code is 4XX or 5XX
    except Exception as e:
        print('Page fetch failed:', str(e))
        return
    # Parse the HTML
    html = resp.text
    dict_data = parse_html(html)
    for img_name, img_url in dict_data.items():
        img_name = img_name.replace('/', ' ')
        file_path = os.path.join(SAVE_DIR, img_name + '.jpg')
        # Overwrite any previous download of the same image
        if os.path.exists(file_path):
            os.remove(file_path)
        img_bytes = get_image_content(img_url, headers)
        if img_bytes is None:
            print('Bad image URL from the site: {}'.format(img_url))
            continue
        with open(file_path, 'wb') as f:
            f.write(img_bytes)
        print('Image saved')
# Batch-download the images from the first 5 pages
def batch_spider_images():
    t0 = time.time()
    # The with-block already waits for all tasks and closes the pool on exit,
    # so there is no need to call executor.shutdown() by hand
    with ThreadPoolExecutor(MAX_WORKERS) as executor:
        executor.map(spider_images, range(1, 6))
    elapsed = time.time() - t0
    print('Total time elapsed: {}'.format(elapsed))

if __name__ == '__main__':
    batch_spider_images()
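One caveat with executor.map(): an exception raised inside spider_images is only re-raised when the returned iterator is consumed, so here it would be dropped silently. Below is a minimal sketch of the submit()/as_completed() pattern that surfaces per-page errors; the function name batch_spider_images_verbose is my own substitution, not part of the original script:

from concurrent.futures import ThreadPoolExecutor, as_completed

# Hypothetical variant of batch_spider_images() that reports failures per page
def batch_spider_images_verbose():
    # submit() returns a Future per page; as_completed() yields each Future as
    # it finishes, and future.result() re-raises any exception from the worker
    with ThreadPoolExecutor(MAX_WORKERS) as executor:
        futures = {executor.submit(spider_images, page): page for page in range(1, 6)}
        for future in as_completed(futures):
            page = futures[future]
            try:
                future.result()
                print('Page {} finished'.format(page))
            except Exception as e:
                print('Page {} failed: {}'.format(page, e))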