Sorting search results by popularity on pixiv requires a premium membership, so I wrote this crawler instead.
It searches pixiv for a tag and saves only the images whose bookmark count reaches a given threshold.
(No proxy is built in; you need to be able to reach and log in to pixiv and supply your own cookie.)

import requests
import re
import os


# Fetch a page and return its text; return an empty string on failure.
def get_html(url, headers):
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Request failed")
        return ""


# Fetch a single artwork page; same error handling as get_html.
def get_artwork_html(url, headers):
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Request failed")
        return ""


# Collect the "regular"-size image URL of every artwork whose bookmark count
# meets the threshold.
def get_img_urls(illust_ids, headers, tag, bookmark):
    img_urls = []
    count = 0
    print('\nLoading data for tag:', tag)
    for illust_id in illust_ids:
        artwork_url = 'https://www.pixiv.net/artworks/{}'.format(illust_id)
        artwork_html = get_artwork_html(artwork_url, headers)
        # The artwork page embeds its metadata as JSON; pull out the bookmark
        # count and the "regular" image URL with regexes.
        bookmark_count = re.findall(
            '"width":.*?,"height":.*?,"pageCount":.*?,"bookmarkCount":(.*?),"likeCount":.*?,"commentCount":.*?',
            artwork_html)
        if bookmark_count and int(bookmark_count[0]) >= bookmark:
            img_url = re.findall(
                '{"mini":".*?","thumb":".*?","small":".*?","regular":"(.*?)","original":".*?"}',
                artwork_html)
            if img_url:
                img_urls.append(img_url[0])
        count += 1
        print('\rLoading progress: {:.2f}%'.format(count * 100 / len(illust_ids)), end='')
    return img_urls


# Download every collected image into root/<sanitized tag>/.
def download_pictures(img_urls, headers, root, tag):
    count = 0
    print('\nStarting download for tag: ' + tag)
    try:
        # Strip characters that Windows does not allow in directory names.
        right_tag = re.sub(r'[\\/:*?"<>|]', '', tag)
        dir_path = root + '/' + right_tag
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        for img_url in img_urls:
            file_name = img_url.split('/')[-1]
            picture_path = dir_path + '/' + file_name
            if not os.path.exists(picture_path):
                # The Referer in the headers dict is required; pixiv's image
                # server rejects requests without it.
                r = requests.get(img_url, headers=headers)
                with open(picture_path, 'wb') as f:
                    f.write(r.content)
                count += 1
                print('\rDownload progress: {:.2f}%'.format(count * 100 / len(img_urls)), end='')
            else:
                print("Image already exists")
    except (OSError, requests.RequestException):
        print("Save failed")


if __name__ == '__main__':
    tag = input('Tag to search for: ')
    bookmark = int(input('Minimum bookmark count: '))
    cookie = input('Paste your pixiv cookie: ')
    headers = {
        'authority': 'pixon.ads-pixiv.net',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'sec-fetch-dest': 'iframe',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://www.pixiv.net/',
        'accept-language': 'zh-CN,zh;q=0.9,ja;q=0.8',
        'cookie': cookie
    }
    root = r'D:\pixiv_tag_imgs'
    # Walk the first nine pages of search results for the tag.
    for page in range(1, 10):
        url = 'https://www.pixiv.net/ajax/search/artworks/{}?word={}&order=date_d&mode=all&p={}&s_mode=s_tag&type=all&lang=ja'.format(tag, tag, page)
        html = get_html(url, headers)
        illust_ids = re.findall('"illustId":"(.*?)"', html)
        if not illust_ids:
            # No results on this page: stop instead of requesting further pages.
            break
        img_urls = get_img_urls(illust_ids, headers, tag, bookmark)
        download_pictures(img_urls, headers, root, tag)
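
As an aside, the search endpoint used in the main loop (https://www.pixiv.net/ajax/search/artworks/...) already returns JSON, so the illust IDs could also be pulled out with the json module instead of a regex. Below is a minimal sketch, assuming the results are nested under body -> illustManga -> data with an "illustId" field per entry; that key layout and the get_illust_ids helper are my assumptions, not part of the script above.

import json

def get_illust_ids(search_response_text):
    # Parse the search AJAX response; return [] if the JSON or the assumed
    # body -> illustManga -> data nesting is not present.
    try:
        data = json.loads(search_response_text)
        works = data['body']['illustManga']['data']
    except (json.JSONDecodeError, KeyError, TypeError):
        return []
    return [work['illustId'] for work in works if 'illustId' in work]

If the keys match, it would replace the re.findall call in the main loop, e.g. illust_ids = get_illust_ids(get_html(url, headers)).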