First, a look at the result of the wallpaper crawl. These are the anime wallpapers; there are several categories in total.
Only two basic points are needed for this crawl: how the site's pagination works, and how to parse each page.
The site crawled in this article is 彼岸图网 (pic.netbian.com). At first glance its anti-scraping and other protections are not very thorough, so it is fairly easy to crawl. Please treat this purely as a learning exercise, and I hope the site operators strengthen their anti-scraping measures; what can be crawled today may no longer be crawlable after the site is upgraded.
Note: the images on 彼岸图网 are shared by users for free. Do not use them for commercial purposes!
Pagination: by opening several pages and observing the URL,
it is easy to see that pagination is simply the page number (index) changing in the URL, so different pages can be requested by concatenating the page number onto the URL. Taking the anime wallpaper category as an example; a small URL-building sketch follows the list below.
Anime wallpapers (the first page is a special case: it has no index suffix):
https://pic.netbian.com/4kdongman/
https://pic.netbian.com/4kdongman/index_2.html
https://pic.netbian.com/4kdongman/index_3.html
https://pic.netbian.com/4kdongman/index_4.html
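Based on this pattern, here is a minimal sketch for generating the list-page URLs, with page 1 (which has no index suffix) handled as the special case. The total page count of 133 is taken from the spider further below and is only an assumption:

# Minimal sketch: build all list-page URLs for the anime category.
base_url = "https://pic.netbian.com/4kdongman/"
page_count = 133  # assumed total; check the site's pagination for the real value

page_urls = [base_url] + [
    f"{base_url}index_{i}.html" for i in range(2, page_count + 1)
]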
Suppose we have fetched the HTML of a page by requesting its URL; the next step is to parse that page and extract the image information (thumbnail URL, detail-page link, text description, and so on).
class ImageListInfoItem(scrapy.Item):
    # thumbnail image URL
    img_src = scrapy.Field()
    # other information about the image, such as title and description
    img_alt = scrapy.Field()
    img_href = scrapy.Field()
    img_desc = scrapy.Field()
    # URL of the image detail page
    img_detail_url = scrapy.Field()
Inspect the element in the browser, copy its XPath, and you can start extracting right away.
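Before writing the Spider, the copied XPath can be sanity-checked interactively in Scrapy's shell; the expression below is the one used by the list spider later in this article (output omitted):

scrapy shell "https://pic.netbian.com/4kdongman/"
>>> response.xpath('//*[@id="main"]/div[3]/ul[@class="clearfix"]//li//a//img/@alt').getall()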
class ImageDetailInfoItem(scrapy.Item):
    # image title, e.g. "报纸墙 长卷发 大波浪美女 4K动漫壁纸"
    img_title = scrapy.Field()
    # image download URL
    img_src = scrapy.Field()
    # file size, e.g. 2.88MB
    img_size = scrapy.Field()
    # upload time, e.g. 2024-01-22 01:12:20
    img_time = scrapy.Field()
    # image dimensions, e.g. 4096x3048
    img_dimension = scrapy.Field()
    # image category, e.g. 4K动漫
    img_type = scrapy.Field()
Spider: a user-defined class that defines the crawling logic, including the start URLs, how to parse responses, and how to extract data or generate new requests.
Selectors: built-in XPath and CSS selector methods for extracting data from HTML or XML documents.
Item: a data container representing one scraped entry; it is a dict-like Python class with predefined field names, which makes it easy to organize and persist crawl results.
Pipeline: a set of customizable components that process the Items scraped by a Spider. They run in order and can clean, validate, deduplicate, and store Items.
These are the pieces used in this article; for a deeper understanding, the official documentation is the better reference. The following takes the anime wallpaper category as the example.
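For reference, the code below assumes the standard project layout generated by scrapy startproject wallpaper, with the Items in items.py, the pipeline in pipelines.py, and the spiders under spiders/ (a sketch; the spider file names here are only illustrative):

wallpaper/
    scrapy.cfg
    wallpaper/
        items.py
        pipelines.py
        settings.py
        spiders/
            dongman_image_list.py
            dongman_image_detail.py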
Fetching each list page of anime wallpapers:
import scrapy
from scrapy.http import Response

from wallpaper.items import ImageListInfoItem

# Anime wallpapers
# https://pic.netbian.com/4kdongman/
# https://pic.netbian.com/4kdongman/index_2.html
# https://pic.netbian.com/4kdongman/index_3.html
# https://pic.netbian.com/4kdongman/index_4.html
class DongmanImageListSpider(scrapy.Spider):
    name = "dongman_image_list_spider"

    def start_requests(self):
        start_page_index = 2
        end_page_index = 133
        for page_index in range(start_page_index, end_page_index):
            url = "https://pic.netbian.com/4kdongman/index_" + str(page_index) + ".html"
            yield scrapy.Request(url=url, callback=self.parser_dongman_image_list_info)
            # for testing, only request the first page of the range
            break

    # parse the image list of one page
    def parser_dongman_image_list_info(self, response: Response):
        xpath_selector = scrapy.Selector(text=response.text)
        ul_li_a_elements = xpath_selector.xpath(
            '//*[@id="main"]/div[3]/ul[@class="clearfix"]//li//a'
        ).getall()
        for a_tag in ul_li_a_elements:
            a_selector = scrapy.Selector(text=a_tag)
            a_href = a_selector.xpath("//a/@href").get()
            img_src = a_selector.xpath("//a//img/@src").get()
            img_alt = a_selector.xpath("//a//img/@alt").get()
            b_text = a_selector.xpath("//a//b/text()").get()
            base_url = "https://pic.netbian.com"

            info = ImageListInfoItem()
            info["img_href"] = a_href
            info["img_src"] = base_url + img_src
            info["img_alt"] = img_alt
            info["img_desc"] = b_text
            info["img_detail_url"] = base_url + a_href
            yield info
Run the spider to start crawling:
scrapy crawl dongman_image_list_spider
The scraped data is eventually handled by the pipeline, which stores it locally in the JSON Lines file dongman_image_list_spider.jl:
{
    "img_href": "/tupian/32227.html",
    "img_src": "https://pic.netbian.com/uploads/allimg/230905/124045-16938888459690.jpg",
    "img_alt": "秋日午后角落女孩4k壁纸",
    "img_desc": "秋日午后角落女孩4k壁纸",
    "img_detail_url": "https://pic.netbian.com/tupian/32227.html"
}
Fetching the detail information for each image:
The request URLs here come from the list crawl above.
import json
import os

import scrapy
from scrapy.http import Response

from wallpaper.items import ImageDetailInfoItem

class DongmanImageDetailSpider(scrapy.Spider):
    name = "dongman_image_detail_spider"

    def start_requests(self):
        current_dir = os.getcwd()
        dongman_file_path = os.path.join(
            current_dir, "data/dongman_image_list_spider.jl"
        )
        with open(dongman_file_path, "r", encoding="utf-8") as f:
            for line in f:
                # skip empty lines
                if line is None or len(line.split()) == 0:
                    continue
                json_dict = json.loads(line.strip())
                img_detail_url = json_dict["img_detail_url"]
                print("img_detail_url ==== " + img_detail_url)
                yield scrapy.Request(
                    url=img_detail_url, callback=self.parser_dongman_image_detail_info
                )
                # for testing, only request the first detail page
                break

    # parse the detail page
    def parser_dongman_image_detail_info(self, response: Response):
        if not response.text:
            return
        pageDetailInfo = ImageDetailInfoItem()
        xpath_selector = scrapy.Selector(text=response.text)
        title = xpath_selector.xpath(
            '//*[@id="main"]/div[2]/div[1]/div[1]/h1/text()'
        ).get()
        pageDetailInfo["img_title"] = title
        img_src = xpath_selector.xpath('//*[@id="img"]/img/@src').get()
        pageDetailInfo["img_src"] = "https://pic.netbian.com" + img_src
        info_element = xpath_selector.xpath(
            '//*[@id="main"]/div[2]/div[2]/div[2]//p'
        ).getall()
        for element in info_element:
            div_selector = scrapy.Selector(text=element)
            value = div_selector.xpath("//span//a/text()").get()
            if value is None:
                value = div_selector.xpath("//span//text()").get()
            if value is None:
                continue
            # tell the fields apart by their content
            if "MB" in value:
                pageDetailInfo["img_size"] = value
            elif "-" in value and ":" in value:
                pageDetailInfo["img_time"] = value
            elif "x" in value:
                pageDetailInfo["img_dimension"] = value
            else:
                pageDetailInfo["img_type"] = value
        yield pageDetailInfo
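The detail spider is started the same way, by its name:
scrapy crawl dongman_image_detail_spider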
The results are likewise written by the pipeline to the local JSON Lines file dongman_image_detail_spider.jl:
{
    "img_title": "黄色 夕阳 女孩 猫 流星 星空 4k动漫壁纸",
    "img_src": "https://pic.netbian.com/uploads/allimg/230912/210736-1694524056d756.jpg",
    "img_type": "4K动漫",
    "img_dimension": "3840x2160",
    "img_size": "3.43 MB",
    "img_time": "2023-09-12 21:06:37"
}
items.py defines the data structures that the scraped data is assembled into.
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class ImageListInfoItem(scrapy.Item):
    img_src = scrapy.Field()
    img_alt = scrapy.Field()
    img_href = scrapy.Field()
    img_desc = scrapy.Field()
    img_detail_url = scrapy.Field()

class ImageDetailInfoItem(scrapy.Item):
    img_title = scrapy.Field()
    img_src = scrapy.Field()
    img_size = scrapy.Field()
    img_time = scrapy.Field()
    img_dimension = scrapy.Field()
    img_type = scrapy.Field()
The pipeline simply writes each item to a file as a line of JSON.
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import json
from itemadapter import ItemAdapter

# writes image list / detail items to <spider name>.jl
class ImageListPipeline:
    data_index = 1

    def open_spider(self, spider):
        self.file = open(spider.name + ".jl", "w", encoding="utf-8")

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        self.data_index = self.data_index + 1
        return item
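As the template comment above notes, the pipeline only runs once it is registered in ITEM_PIPELINES. A minimal sketch of the corresponding settings.py entry, assuming the project module is named wallpaper (as in the imports above) and the class lives in wallpaper/pipelines.py; the priority value 300 is arbitrary:

# settings.py
ITEM_PIPELINES = {
    "wallpaper.pipelines.ImageListPipeline": 300,
}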
After the steps above we have the download URL and other metadata for every image.
Downloading the images is done with a separate script: it reads the local data file of each wallpaper category, takes the img_src of every entry, and issues a GET request with the requests library. This downloaded close to twenty thousand images without interruption; I seemingly hit no anti-scraping blocks at all, even though most sites have at least some countermeasures.
# download.py
import json
import os

import requests

global_successful_count = 1
global_failed_count = 1


def download():
    # local data file names, one per wallpaper category
    filename_list = [
        "dongman_image_detail_spider",
        "senery_image_detail_spider",
        "beauty_image_detail_spider",
        "backdrop_image_detail_spider",
        "game_image_detail_spider",
        "car_image_detail_spider",
        "move_image_detail_spider",
        "animal_image_detail_spider",
        "ipad_image_detail_spider",
        "single_image_detail_spider",
        "phone_image_detail_spider",
    ]
    for filename in filename_list:
        current_dir = os.getcwd()
        file_path = os.path.join(current_dir, "data/" + filename + ".jl")
        start_download_image(file_path, filename)


def start_download_image(file_path, file_dir_name):
    global global_successful_count, global_failed_count
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            request_data = genRequestData(line, file_dir_name)
            if request_data is None:
                continue
            url = request_data.url
            is_ok = download_save_to_file(
                url, os.path.join(request_data.save_path, request_data.filename)
            )
            if is_ok:
                print("download ok: ", global_successful_count, url)
                global_successful_count = global_successful_count + 1
            else:
                print("download fail: ", global_failed_count, url)
                global_failed_count = global_failed_count + 1
            # for testing, only download the first image of each file
            break


def genRequestData(line, dir_name):
    # build a download request (url, filename, save path) from one JSON line
    if line is None or len(line.strip()) == 0:
        return None
    json_dict = json.loads(line.strip())
    url = json_dict["img_src"]
    if url is None or len(url.strip()) == 0:
        return None
    filename = url.split("/")[-1]
    if not filename.endswith(".jpg") and not filename.endswith(".png"):
        filename = filename + ".jpg"
    request_data = DownloadRequestData()
    request_data.filename = filename
    request_data.url = url
    file_path = os.path.join(os.getcwd(), "data/img/" + dir_name)
    request_data.save_path = file_path
    if not os.path.exists(file_path):
        os.makedirs(file_path)
    return request_data


def download_save_to_file(url, filename):
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        return False
    with open(filename, "wb") as f:
        f.write(response.content)
    return True


class DownloadRequestData:
    url = ""
    filename = ""
    save_path = ""

    def to_string(self):
        print("download request data: ", self.filename, self.url, self.save_path)


if __name__ == "__main__":
    download()
Once everything is in place, run the script to start downloading:
python download.py
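For reference, the script assumes the .jl files produced by the spiders sit in a data/ directory next to it, and it writes the images into per-category subfolders under data/img/ (a sketch of the resulting layout, based on the paths in the code above):

data/
    dongman_image_detail_spider.jl
    senery_image_detail_spider.jl
    ...
    img/
        dongman_image_detail_spider/
        senery_image_detail_spider/
        ...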
What I crawled is probably not the original images: a high-quality original might be 4MB, while what I got is around 400KB. That is fine, though, since I plan to run the images through a local image-restoration / super-resolution tool afterwards.
After some searching I found an open-source, free, multi-platform tool that runs entirely locally: Upscayl.
Comparing three pairs of images, I am quite satisfied with the results.