Ajax爬取今日头条街拍美图

运行环境:Python 3.6.0

"""
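Scrape street-snap photos from Toutiao search results, de-duplicate the
downloaded images, and store them in one folder per article title.
A multiprocessing pool is used to speed up the crawl.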
"""
import requests
from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool

# Translation table applied to a title before it is used as a sub-directory
# name: each character that is not allowed in a directory name (the set
# reserved on Windows) is replaced by a space.
# The backslash is written as an explicit "\\" escape; the bare "\/" form is
# an invalid escape sequence that newer Python versions warn about.
table = str.maketrans('\\/:*?"<>|', ' ' * 9)


def get_page(search_keywords, offset):
    """
    Fetch one page of Toutiao search results and return it as parsed JSON.

    :param search_keywords: keyword to search for
    :param offset: value sent as the ``offset`` query parameter
    :return: the decoded JSON payload, or None when the request fails,
             the status code is not 200, or the body is not valid JSON
    """
    params = {
        'aid': '24',
        'app_name': 'web_search',
        'offset': offset,
        'format': 'json',
        'keyword': search_keywords,
        'autoload': 'true',
        'count': 20,
        'en_qc': 1,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(params)
    print(url)
    try:
        # Without a timeout a stalled connection would block this worker forever.
        response = requests.get(url=url, timeout=10)
        if response.status_code != 200:
            print('请求内容错误')
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        print('Error', e.args)
        return None


def get_image(json):
    """
    Yield one record per image found in a search-result payload.

    :param json: decoded JSON payload of a result page; may be None or
                 empty, in which case nothing is yielded
    :return: generator of dicts with the keys 'title' and 'image' (the URL)
    """
    # A failed page fetch produces None; treat it as "no images" instead of
    # raising AttributeError on None.get.
    if not json:
        return
    for item in json.get('data') or ():
        images = item.get('image_list')
        if not images:
            continue
        title = item.get('title')
        for image in images:
            url = image.get('url')
            # An entry without a URL cannot be downloaded, so skip it.
            if not url:
                continue
            yield {
                'title': title,
                'image': url,
            }


def save_image(save_directory, item):
    """
    Download one image and store it under <save_directory>/<cleaned title>/.

    The file is named after the MD5 digest of its content, so an image that
    is already present in that folder is not written a second time.

    :param save_directory: root directory for saved images
    :param item: dict with the keys 'title' and 'image' (the image URL)
    :return: None
    """
    # Name the sub-directory after the title, with characters that are not
    # allowed in directory names replaced; a missing title gives an empty name.
    two_level_directory = (item.get('title') or '').translate(table).replace('.', '').strip()
    target_directory = os.path.join(save_directory, two_level_directory)
    # exist_ok avoids the check-then-create race when several pool workers
    # try to create the same directory at the same time.
    os.makedirs(target_directory, exist_ok=True)

    try:
        # Without a timeout a stalled download would block this worker forever.
        response = requests.get(item.get('image'), timeout=10)
    except requests.RequestException as e:
        # RequestException also covers timeouts and malformed URLs, not just
        # connection failures.
        print('Failed to save image: ', item.get('title'))
        print('Reason: ', e.args)
        return

    if response.status_code != 200:
        return

    image = md5(response.content).hexdigest()
    file_path = os.path.join(target_directory, '{0}.jpg'.format(image))
    if os.path.exists(file_path):
        print(file_path)
        print('is already download')
        return

    with open(file_path, 'wb') as f:
        print(image)
        f.write(response.content)


def main(offset):
    """
    Download every image found on one page of search results.

    :param offset: result offset of the page to process
    :return: None
    """
    search_keywords = '街拍'
    save_directory = 'Ajax爬取今日头条街拍美图'
    page = get_page(search_keywords=search_keywords, offset=offset)
    # get_page returns None when the request fails; there is nothing to
    # download in that case, so stop instead of passing None along.
    if page is None:
        return
    for item in get_image(json=page):
        print(item)
        save_image(save_directory, item=item)


# First (inclusive) and last (exclusive) page index to crawl.
GROUP_START = 0
GROUP_END = 20


if __name__ == '__main__':
    # One task per page; the offset handed to main is the page index times 20.
    offsets = []
    for page_index in range(GROUP_START, GROUP_END):
        offsets.append(page_index * 20)

    workers = Pool()
    workers.map(main, offsets)
    # Stop accepting work, then wait for the worker processes to exit.
    workers.close()
    workers.join()

# [main(x*20) for x in range(GROUP_START, GROUP_END)]

# print(os.path.dirname(__file__))

运行结果:

Ajax爬取今日头条街拍美图_第1张图片

Ajax爬取今日头条街拍美图_第2张图片

 

你可能感兴趣的:(Spider)