# Toutiao crawler -- downloads Scarlett Johansson ("Widow Sister") search-result images


from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests
import os
from hashlib import md5

# Request headers shared by every API call; they mimic the browser's own
# AJAX request so the endpoint returns JSON instead of blocking the crawler.
headers = {
    'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
    'accept': 'application/json, text/javascript',
    'content-type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0;WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome / 67.0.3396.99 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest'
}

# Base URL of the Toutiao search-content AJAX endpoint; query params are appended.
base_url = 'https://www.toutiao.com/search_content/?'

def get_page(offest):
    """Fetch one page of Toutiao search results via the AJAX API.

    Args:
        offest: pagination offset (a multiple of 20); the only query
            parameter that changes between successive AJAX requests.

    Returns:
        The parsed JSON response (dict) on HTTP 200, otherwise None.
    """
    params = {
        'offset': offest,
        'format': 'json',
        # Search keyword (can be changed to crawl a different topic).
        'keyword': '斯嘉丽·约翰逊',
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab'
    }
    # Build the full request URL from the base endpoint plus query string.
    url = base_url + urlencode(params)
    try:
        # A timeout keeps a worker process from hanging forever on a
        # stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except requests.RequestException:
        # Narrowed from BaseException: catching BaseException also swallowed
        # KeyboardInterrupt/SystemExit, making the crawler hard to stop.
        return None
    # Non-200 responses yield None, same as a failed request.
    return None
        
# Parse one page of API results into {'title': ..., 'images': ...} dicts.
def parse_page(results):
    """Yield one dict per article containing its title and image list.

    Args:
        results: JSON dict returned by get_page(), or None on failure.

    Yields:
        dict with keys 'title' and 'images'; entries missing either
        field (e.g. ads or text-only articles) are skipped.
    """
    # Guard against a failed fetch (None) or a response without 'data';
    # the original crashed with AttributeError/TypeError in those cases.
    if not results:
        return
    for result in results.get('data') or []:
        try:
            # Use a regular name instead of shadowing the builtin `dict`.
            item = {
                'title': result['title'],
                'images': result['image_list'],
            }
            yield item
        except (KeyError, TypeError):
            # Narrowed from a bare `except:` -- only skip records that
            # genuinely lack the fields we need.
            pass

# Download and save the images of one article.
def save_img(item: dict):
    """Download every image of one article into a folder named after its title.

    Creates the target directory if needed; each image is saved under the
    MD5 of its content, so re-downloads of identical images are detected.

    Args:
        item: dict with 'title' (str) and 'images' (list of {'url': ...}),
            as produced by parse_page().
    """
    # Directory where this article's images are stored.
    dirname = 'C:\\Users\\13194\\Pictures\\' + item.get('title')
    # makedirs(..., exist_ok=True) avoids the check-then-create race of the
    # original os.path.exists() + os.mkdir() pair.
    os.makedirs(dirname, exist_ok=True)
    for image in item.get('images'):
        img = image['url']
        try:
            # URLs arrive protocol-relative ("//p3.pstatp.com/...");
            # without the "https:" prefix requests raises MissingSchema.
            resp = requests.get('https:' + img, timeout=10)
            # BUG FIX: in the original, a non-200 response left `file_path`
            # unbound and the following os.path.exists() raised an uncaught
            # NameError; now we simply skip the failed image.
            if resp.status_code != 200:
                continue
            # Images must be fetched as raw binary content.
            content = resp.content
            file_path = '{0}/{1}.{2}'.format(dirname, md5(content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as file:
                    file.write(content)
            else:
                print('Already Downloaded', file_path)
        except requests.RequestException:
            # Broadened from ConnectionError so timeouts are handled too;
            # also fixes the "Faleld" typo in the message.
            print('Failed to Save Image')

# Crawl one results page at the given offset.
def main(offest):
    """Fetch the page at `offest`, parse it, and save every image found.

    Args:
        offest: pagination offset passed straight to get_page().
    """
    results = get_page(offest)
    if not results:
        # Network error or non-200 response: nothing to do for this offset.
        return
    # BUG FIX: the original looped `for i in range(len(results))` -- the
    # len() of the JSON dict -- and re-parsed the same page on every pass,
    # downloading each image several times over (and raising TypeError on
    # len(None) when the fetch failed). One pass over the items is enough.
    for item in parse_page(results):
        save_img(item)
# First and last page indices to crawl (offsets 20 .. 400 in steps of 20).
START = 1
END = 20

if __name__ == '__main__':
    # Fan the page offsets out across a pool of worker processes, one
    # crawl task per offset, then wait for all of them to finish.
    offsets = [page * 20 for page in range(START, END + 1)]
    worker_pool = Pool()
    worker_pool.map(main, offsets)
    worker_pool.close()
    worker_pool.join()

# (blog footer) "You may also be interested in: Toutiao -- Widow Sister"