Pixiv crawler (downloading illustrations from the artists you follow)
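The script below works in four steps: it collects the IDs of every artist you follow from the bookmark pages, fetches each artist's work list from the https://www.pixiv.net/ajax/user/{user_id}/profile/all endpoint, scrapes an image URL out of each artwork page, and finally downloads the files into one folder per artist under the root directory. A logged-in pixiv cookie pasted into the headers dictionary is required throughout.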

import requests
import re
import os


def get_original_json_text(url, headers):
    try:

        r = requests.get(url, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.json()
    except Exception:
        print("Request failed")
        return {}


# Extract the artwork IDs from the profile JSON
def get_illusts(data):
    body = data['body']
    author_illusts = body['illusts']
    illusts = list(author_illusts.keys())
    return illusts
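
# For reference, the profile JSON consumed above is roughly shaped like the sketch
# below (only the fields this script actually reads are shown; the real response
# carries more keys, such as manga and novels):
#
#   {
#       "error": false,
#       "body": {
#           "illusts": {"12345678": null, "23456789": null, ...}
#       }
#   }
#
# get_illusts() only needs the keys of body['illusts'], which are the artwork IDs.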


# Build each artwork's page URL from its ID, then pull the image URL out of the page
def get_pictures_urls(illusts, headers, user_id):
    img_urls = []
    count = 0
    print('\nLoading data for user: ' + user_id)
    for illust in illusts:
        img_url = 'https://www.pixiv.net/artworks/' + illust
        html = requests.get(img_url, headers=headers).text
        img_url = re.findall('{"mini":".*?","thumb":".*?","small":".*?","regular":"(.*?)","original":".*?"}', html)
        img_urls.append(img_url[-1])
        count += 1
        print('\rLoading progress: {:.2f}%'.format(count * 100 / len(illusts)), end='')
    return img_urls
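
# Note: the regex above captures the "regular" (medium-size) image URL. The same
# inline JSON also carries an "original" field, so if you would rather save the
# full-resolution file, an (untested) variant of the capture would be:
#
#   img_url = re.findall('{"mini":".*?","thumb":".*?","small":".*?","regular":".*?","original":"(.*?)"}', html)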


# Get the artist's name, used to name the download folder
def get_user_name(user_id, headers):
    name_url = 'https://www.pixiv.net/users/' + user_id
    r = requests.get(name_url, headers=headers)
    html = r.text
    user_name = re.findall('(.*?) - pixiv', html)
    return user_name[-1]
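
# If the title-based regex above ever fails to match, an alternative sketch
# (assuming pixiv's user ajax endpoint keeps its current JSON shape) is to read
# the name from https://www.pixiv.net/ajax/user/<user_id> instead:
#
#   user_name = requests.get('https://www.pixiv.net/ajax/user/' + user_id,
#                            headers=headers).json()['body']['name']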


# Save the images
def download_pictures(pictures_urls, headers, root, user_name):
    count = 0
    print('\nStarting download for: ' + user_name)
    try:
        right_user_name = re.sub(r'[\\/:*?"<>|]', '', user_name)
        dir_path = root + '/' + right_user_name
        # create the root and per-artist folders if they don't already exist
        os.makedirs(dir_path, exist_ok=True)
        for pictures_url in pictures_urls:
            file_name = pictures_url.split('/')[-1]
            picture_path = dir_path + '/' + file_name
            if not os.path.exists(picture_path):
                r = requests.get(pictures_url, headers=headers)
                with open(picture_path, 'wb') as f:
                    f.write(r.content)
                    count += 1
                    print('\rDownload progress: {:.2f}%'.format(count * 100 / len(pictures_urls)), end='')
            else:
                print("图片已存在")

    except Exception:
        print("Save failed")

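# Note on the downloads above: pixiv's image host (i.pximg.net) is known to reject
# requests that do not carry a pixiv Referer, so the 'referer': 'https://www.pixiv.net/'
# entry in the headers dictionary defined in __main__ is what lets
# download_pictures() fetch the files successfully.
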
# Get the IDs of all followed artists
def get_user_ids(headers):
    final_user_ids = []
    total_page = int('')  # total number of pages in your followed-artists list (fill this in)
    for page in range(1, total_page + 1):
        url = 'https://www.pixiv.net/bookmark.php?type=user&rest=show&p={page}'.format(page=page)
        html = requests.get(url, headers=headers).text
        user_ids = re.findall('data-user_id="(.*?)"', html)
        for user_id in user_ids:
            final_user_ids.append(user_id)
    return final_user_ids
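
# total_page in get_user_ids() has to be filled in by hand. A simple alternative
# (just a sketch) is to ask for it at runtime instead of hard-coding it:
#
#   total_page = int(input('How many pages does your followed-artists list have? '))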


if __name__ == '__main__':
    headers = {
        'authority': 'pixon.ads-pixiv.net',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'sec-fetch-dest': 'iframe',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-mode': 'navigate',
        'referer': 'https://www.pixiv.net/',
        'accept-language': 'zh-CN,zh;q=0.9,ja;q=0.8',
        'cookie': 'your cookie'
    }
    root = "D:\pixiv_users_pictures"
    # Login is handled through the cookie in the headers above; no email/password prompt is needed.

    user_ids = get_user_ids(headers)

    for user_id in user_ids:
        picture_url = 'https://www.pixiv.net/ajax/user/{}/profile/all?lang=ja'.format(user_id)

        data = get_original_json_text(picture_url, headers)
        user_name = get_user_name(user_id, headers)
        illusts = get_illusts(data)
        pictures_urls = get_pictures_urls(illusts, headers, user_id)
        download_pictures(pictures_urls, headers, root, user_name)

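To run the script: paste your logged-in pixiv cookie into the headers dictionary, fill in total_page inside get_user_ids(), and change root if you do not want the images saved under D:\pixiv_users_pictures. Each followed artist gets a sub-folder named after them, and images that already exist on disk are skipped.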

