Python crawler: downloading pixiv images by popularity

On pixiv, sorting search results by popularity is locked behind a premium membership, so I wrote this crawler as a workaround.
It searches pixiv for a tag, checks each work's bookmark count, and saves only the images bookmarked at least a given number of times.
(No proxy support is built in; you need a connection that can log in to pixiv directly.)

import requests
import re
import os


def get_html(url, headers):
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        print("Fetch failed")
        return ""


# Fetch each artwork page and keep only works with enough bookmarks
def get_img_urls(illust_ids, headers, tag, bookmark):
    img_urls = []
    count = 0
    print('\nLoading data for tag:', tag)
    for illust_id in illust_ids:
        artwork_url = 'https://www.pixiv.net/artworks/{}'.format(illust_id)
        artwork_html = get_html(artwork_url, headers)
        # The artwork page embeds a preload JSON blob; scrape bookmarkCount out of it
        bookmark_count = re.findall(
            '"width":.*?,"height":.*?,"pageCount":.*?,"bookmarkCount":(.*?),"likeCount":.*?,"commentCount":.*?',
            artwork_html)
        if bookmark_count and int(bookmark_count[0]) >= bookmark:
            img_url = re.findall('{"mini":".*?","thumb":".*?","small":".*?","regular":"(.*?)","original":".*?"}',
                                 artwork_html)
            if img_url:
                img_urls.append(img_url[0])
        count += 1
        print('\rLoading progress: {:.2f}%'.format(count * 100 / len(illust_ids)), end='')
    return img_urls
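
The regexes above scrape the preload JSON that pixiv embeds in each artwork page, and they will break whenever pixiv reorders those fields. A more robust variant would hit a JSON API directly. This is only a sketch: it assumes pixiv's ajax illust endpoint (https://www.pixiv.net/ajax/illust/{id}) returns a body carrying bookmarkCount and an urls dict, which I have not verified against the current site.

# A JSON-based alternative to the regex scraping above.
# Assumption (unverified): the endpoint returns
# {"body": {"bookmarkCount": ..., "urls": {"regular": ...}}}.
def get_img_url_via_ajax(illust_id, headers, bookmark):
    r = requests.get('https://www.pixiv.net/ajax/illust/{}'.format(illust_id),
                     headers=headers)
    body = r.json().get('body') or {}
    if body.get('bookmarkCount', 0) >= bookmark:
        return (body.get('urls') or {}).get('regular')
    return None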

# Download the selected images into root/<sanitized tag>/
def download_pictures(img_urls, headers, root, tag):
    count = 0
    print('\nStarting download:', tag)
    try:
        # Strip characters that are illegal in Windows directory names
        right_tag = re.sub(r'[\\/:*?"<>|]', '', tag)
        dir_path = root + '/' + right_tag

        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)
        for img_url in img_urls:
            file_name = img_url.split('/')[-1]
            picture_path = dir_path + '/' + file_name
            if not os.path.exists(picture_path):
                # i.pximg.net rejects requests without a pixiv Referer;
                # the headers dict below already carries one
                r = requests.get(img_url, headers=headers)
                with open(picture_path, 'wb') as f:
                    f.write(r.content)
                count += 1
                print('\rDownload progress: {:.2f}%'.format(count * 100 / len(img_urls)), end='')
            else:
                print('Image already exists:', file_name)
    except (requests.RequestException, OSError):
        print("Save failed")


if __name__ == '__main__':
    tag = input('Tag to search for: ')
    bookmark = int(input('Minimum bookmark count: '))
    # Copy the cookie from your browser's devtools while logged in to pixiv
    cookie = input('Paste your pixiv cookie: ')
    headers = {
        'authority': 'pixon.ads-pixiv.net',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'sec-fetch-dest': 'iframe',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://www.pixiv.net/',
        'accept-language': 'zh-CN,zh;q=0.9,ja;q=0.8',
        'cookie': cookie
    }
    
    root = r'D:\pixiv_tag_imgs'

    # Walk the first 9 pages of search results; widen the range for more works
    for page in range(1, 10):
        url = 'https://www.pixiv.net/ajax/search/artworks/{}?word={}&order=date_d&mode=all&p={}&s_mode=s_tag&type=all&lang=ja'.format(tag, tag, page)
        html = get_html(url, headers)
        illust_ids = re.findall('"illustId":"(.*?)"', html)
        img_urls = get_img_urls(illust_ids, headers, tag, bookmark)
        download_pictures(img_urls, headers, root, tag)
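
The loop above is capped at nine pages. The page count could instead be derived from the search response itself. A sketch, assuming the ajax search JSON reports its total hit count under body["illustManga"]["total"] (field name unverified):

# Sketch: derive the page count from the first search response instead of
# hardcoding range(1, 10). The body["illustManga"]["total"] field name is an
# unverified assumption.
import json
import math

def get_page_count(tag, headers):
    first_url = 'https://www.pixiv.net/ajax/search/artworks/{}?word={}&order=date_d&mode=all&p=1&s_mode=s_tag&type=all&lang=ja'.format(tag, tag)
    data = json.loads(get_html(first_url, headers) or '{}')
    body = data.get('body') or {}
    total = (body.get('illustManga') or {}).get('total', 0)
    return math.ceil(total / 60)  # the web UI lists 60 works per search page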
