# 1、系统分析目标网页
# 2、html标签数据解析方法
# 3、海量图片数据一键保存
#
# 开发环境:
#   python 3.8
#   pycharm 2021专业版
#
# 第三方模块:
#   requests >>> pip install requests
#   parsel >>> pip install parsel
import requests
import parsel
import os
# Gallery scraper: reads the listing page at `url`, visits every gallery it
# links to, and saves each gallery's images under img/<gallery title>/.

# Browser-like User-Agent sent with EVERY request so the site treats the
# scraper like a regular browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
}

# Seconds to wait on the server before giving up; without a timeout a stalled
# connection would block the script forever.
REQUEST_TIMEOUT = 10

# Characters that are not allowed in file/directory names on common platforms
# are replaced with '_' before a title is used as a folder name.
_UNSAFE_NAME_CHARS = str.maketrans({char: '_' for char in '\\/:*?"<>|'})

url = 'https://www.hexuexiao.cn/meinv/qingchunmeinv/'

# Fetch and parse the listing page.
list_html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
list_select = parsel.Selector(list_html)

# Read href and title from the SAME anchor element. Extracting two independent
# lists and zipping them silently misaligns links and titles as soon as one
# anchor has no text node (e.g. it only wraps an image) or has several.
url_list = []
title_list = []
for anchor in list_select.css('.waterfall_1box dt a'):
    href = anchor.xpath('@href').get()
    text = ''.join(anchor.xpath('text()').getall()).strip()
    if href and text:
        url_list.append(href)
        title_list.append(text)

for link, title in zip(url_list, title_list):
    print(f'正在爬取: {title}')

    # makedirs also creates the parent 'img' directory on the first run and is
    # a no-op when the folder already exists.
    folder = 'img/' + title.translate(_UNSAFE_NAME_CHARS)
    os.makedirs(folder, exist_ok=True)

    # 1. Send the request (GET: the same kind of request a browser issues when
    #    a URL is opened directly).
    response = requests.get(url=link, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'

    # 2. Get the page source.
    html_data = response.text

    # 3. Parse out the image URLs of the gallery carousel.
    selector = parsel.Selector(html_data)
    img_list = selector.css('.swiper-slide a img::attr(src)').getall()
    # The first and last matches are dropped.
    # NOTE(review): presumably these are not real gallery images (e.g. cloned
    # carousel slides) - confirm against the page markup.
    img_list = img_list[1: -1]

    for img_url in img_list:
        # Removing the '.300.jpg' suffix is expected to turn the thumbnail URL
        # into the full-size image URL.
        big_img = img_url.replace('.300.jpg', '')

        img_response = requests.get(big_img, headers=headers, timeout=REQUEST_TIMEOUT)
        if not img_response.ok:
            # Do not save an HTTP error page to disk as if it were an image.
            print(f'下载失败, 已跳过: {big_img} (HTTP {img_response.status_code})')
            continue

        # .content is the raw binary body of the response.
        img_data = img_response.content
        img_name = big_img.split('/')[-1]
        print(img_name)
        with open(f'{folder}/{img_name}', mode='wb') as f:
            f.write(img_data)