Python 爬取 pixivic

# coding=utf-8
import sys
import requests  # 导入requests库
import re  # 导入正则表达式库
import os  # 保存文件
import threading  # 导入多线程库
VNK = 'dbcbfa01'  # default VNK value; appended as the ?VNK= query parameter of the Referer header
data = '2020-07-21'  # ranking date to crawl; sent as the `date` query parameter
pages = 1  # number of ranking pages to fetch (the site loads the ranking asynchronously, so it actually spans multiple pages)
downloadPath = '/Users/lidong/Downloads/'  # directory where downloaded images are saved


# Timeout, in seconds, passed to every HTTP request so that a stalled
# connection cannot block a worker thread forever.
_REQUEST_TIMEOUT = 30

# Captures the title of each entry in the ranking JSON text.
_TITLE_PATTERN = re.compile('"artistId":.*?,"title":"(.*?)","type"')

# Captures the "original" image URL of each entry whose file name contains
# "_p0". Group 1 is the path below img-original/img/, group 2 is the numeric
# image id embedded in that path. Using one pattern for both values keeps
# them aligned by construction.
_ORIGINAL_PATTERN = re.compile(
    '"original":"https://i.pximg.net/img-original/img/'
    '(..../../../../../../([0-9]*?)_p0.*?g)"')


def _safe_filename(title, fallback):
    """Return *title* made usable as a file name, or *fallback* if it is blank."""
    try:
        cleaned = title.replace('\\', '_').replace('?', '过滤')
    except UnicodeError:
        # NOTE(review): presumably only reachable on Python 2, where mixing
        # a unicode title with a non-ASCII byte-string literal can fail to
        # decode. Stay best-effort and keep the title as it is.
        cleaned = title
    # Also neutralise path separators and the other characters that common
    # file systems reject, so open() cannot fail or escape downloadPath.
    cleaned = re.sub(r'[/:*"<>|\r\n\t]', '_', cleaned)
    return cleaned if cleaned.strip() else fallback


def get_pixiv(page):
    """Download the original images listed on one page of the daily ranking.

    Requests the ranking JSON for ``page`` and the module-level date
    ``data``, extracts titles and original-image paths from the raw response
    text with regular expressions, fetches every image from the
    ``original.img.cheerfun.dev`` host and saves it in ``downloadPath`` under
    a file name derived from its title.

    A failed image download (network error or non-200 status) is reported on
    stdout and skipped; nothing is written to disk for it and the remaining
    images are still processed.

    Args:
        page: Ranking page number, inserted into the ``page`` query
            parameter. The script's entry point passes 1..pages.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the ranking listing itself
            cannot be fetched (including a timeout).
        IOError/OSError: If an image file cannot be written.
    """
    listing_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    }

    # Endpoint behind the site's asynchronous loading; it returns JSON.
    listing_url = ('https://api.pixivic.com/ranks?page=' + str(page)
                   + '&date=' + data + '&mode=day&pageSize=30')

    response = requests.get(listing_url, headers=listing_headers,
                            timeout=_REQUEST_TIMEOUT)
    response.encoding = response.apparent_encoding  # avoid mojibake in titles
    html = response.text

    titles = _TITLE_PATTERN.findall(html)
    originals = _ORIGINAL_PATTERN.findall(html)

    # NOTE(review): titles and images are paired purely by their order in the
    # response text; this assumes each entry yields exactly one title match
    # and one "_p0" original match. Confirm against the API response.
    for title, (image_path, illust_id) in zip(titles, originals):
        image_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362',
            # Without this header the image host answers 403.
            'Referer': 'https://pixivic.com/illusts/' + illust_id + '?VNK=' + VNK,
            'Accept': 'image/png, image/svg+xml, image/*; q=0.8, */*; q=0.5',
            'Host': 'original.img.cheerfun.dev',
            'Cache-Control': 'max-age=0',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'Keep-Alive',
            'Accept-Language': 'zh-CN',
        }

        # Real location of the original image (found by capturing traffic);
        # it cannot be accessed directly without the headers above.
        image_url = 'https://original.img.cheerfun.dev/img-original/img/' + image_path

        # Fall back to the image id when the title is blank.
        name = _safe_filename(title, illust_id)

        try:
            image = requests.get(image_url, headers=image_headers,
                                 timeout=_REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as error:
            # One unreachable image must not abort the rest of the page.
            print('下载图片' + name + '失败!' + str(error))
            continue

        if image.status_code != 200:
            # status_code is an int and must be converted before joining.
            print('错误代码' + str(image.status_code) + '下载图片' + name + '失败!')
            # Do not save an error body as if it were an image.
            continue

        print('正在下载图片:' + name)

        # Keep the extension of the original (e.g. .png) instead of always
        # labelling the file .jpg; fall back to .jpg if the path has none.
        extension = os.path.splitext(image_path)[1] or '.jpg'
        with open(os.path.join(downloadPath, name + extension), 'wb') as f:
            f.write(image.content)


if __name__ == '__main__':
    # Python 2 leftover: force the interpreter-wide default encoding to
    # UTF-8. The builtin `reload` and `sys.setdefaultencoding` exist only on
    # Python 2; on Python 3 the default encoding is already 'utf-8', so this
    # branch is not entered there.
    current_encoding = sys.getdefaultencoding()
    if current_encoding != 'utf-8':
        reload(sys)
        sys.setdefaultencoding('utf-8')

    # Start one worker thread per ranking page, numbered 1..pages.
    for page in range(1, pages + 1):
        worker = threading.Thread(target=get_pixiv, args=(page,))
        worker.start()

你可能感兴趣的:(Python)