爬取今日头条

import re
import requests
import json,os
from urllib import request

def get_detail(url,title):
    """Download every image of one gallery article into ``download/<title>/``.

    The article page is fetched and searched for the embedded
    ``gallery: JSON.parse(...)`` payload. Each entry of its ``sub_images``
    list is then saved as ``<last URL segment>.jpg``.

    Args:
        url: Address of the article page.
        title: Article title; used as the folder name after characters that
            are unsafe in file paths have been replaced with ``_``.

    Returns:
        None. Pages without a parsable gallery payload are skipped with a
        message instead of raising.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    # Example: url = 'https://www.toutiao.com/a6589905154147877384/#p=3'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    page_html = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', page_html)
    if match_res is None:
        # Not every search hit is a gallery page; nothing to download.
        print('no gallery data found, skipping: ' + url)
        return

    try:
        # The argument of JSON.parse is itself a JSON string literal, so it is
        # decoded twice: once to get the inner text, once to get the object.
        result = json.loads(json.loads(match_res.group(1)))
    except (ValueError, TypeError):
        print('could not decode gallery data, skipping: ' + url)
        return

    # Path separators and characters forbidden on common file systems would
    # otherwise create nested folders or make makedirs fail.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Walk the keyword search results and download each article's gallery.

    Starting at ``offset``, result pages are requested 20 entries at a time
    while the offset is at most 80. Every entry that carries an
    ``article_url`` is handed to ``get_detail``.

    Args:
        offset: Result offset of the first page to fetch (default 0).

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20
    # Same bound as the original ``offset/20 <= 4`` check. The old comment
    # spoke of four pages, but this bound covers offsets 0 through 80.
    last_offset = 4 * page_size

    # A loop with integer arithmetic replaces the recursion: true division
    # used to turn the offset into a float, so later URLs carried
    # ``offset=20.0``. It also avoids a final request whose response was
    # never used.
    while offset <= last_offset:
        response = requests.get(url.format(offset), timeout=10)
        res_json = response.json()

        # ``data`` can be absent or null when there are no results.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                article_url = page['article_url']
                title = page.get('title') or 'untitled'
                get_detail(article_url, title)

        offset += page_size

if __name__ == '__main__':
    # Start crawling from the first results page.
    get_url(0)


import re
import requests
import json,os
from urllib import request

def get_detail(url,title):
    """Download every image of one gallery article into ``download/<title>/``.

    The article page is fetched and searched for the embedded
    ``gallery: JSON.parse(...)`` payload. Each entry of its ``sub_images``
    list is then saved as ``<last URL segment>.jpg``.

    Args:
        url: Address of the article page.
        title: Article title; used as the folder name after characters that
            are unsafe in file paths have been replaced with ``_``.

    Returns:
        None. Pages without a parsable gallery payload are skipped with a
        message instead of raising.
    """
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    # Example: url = 'https://www.toutiao.com/a6589905154147877384/#p=3'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, headers=headers, timeout=10)
    page_html = response.text

    match_res = re.search(r'gallery: JSON.parse\((.+?)\)', page_html)
    if match_res is None:
        # Not every search hit is a gallery page; nothing to download.
        print('no gallery data found, skipping: ' + url)
        return

    try:
        # The argument of JSON.parse is itself a JSON string literal, so it is
        # decoded twice: once to get the inner text, once to get the object.
        result = json.loads(json.loads(match_res.group(1)))
    except (ValueError, TypeError):
        print('could not decode gallery data, skipping: ' + url)
        return

    # Path separators and characters forbidden on common file systems would
    # otherwise create nested folders or make makedirs fail.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
    target_dir = os.path.join('download', safe_title)
    os.makedirs(target_dir, exist_ok=True)

    for image_ in result.get('sub_images') or []:
        image_url = image_['url']
        fname = image_url.split('/')[-1]
        request.urlretrieve(image_url, os.path.join(target_dir, fname + '.jpg'))

    print(result)

def get_url(offset=0):
    """Walk the keyword search results and download each article's gallery.

    Starting at ``offset``, result pages are requested 20 entries at a time
    while the offset is at most 80. Every entry that carries an
    ``article_url`` is handed to ``get_detail``.

    Args:
        offset: Result offset of the first page to fetch (default 0).

    Returns:
        None.
    """
    url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=1&from=search_tab'
    page_size = 20
    # Same bound as the original ``offset/20 <= 4`` check. The old comment
    # spoke of four pages, but this bound covers offsets 0 through 80.
    last_offset = 4 * page_size

    # A loop with integer arithmetic replaces the recursion: true division
    # used to turn the offset into a float, so later URLs carried
    # ``offset=20.0``. It also avoids a final request whose response was
    # never used.
    while offset <= last_offset:
        response = requests.get(url.format(offset), timeout=10)
        res_json = response.json()

        # ``data`` can be absent or null when there are no results.
        for page in res_json.get('data') or []:
            if 'article_url' in page:
                article_url = page['article_url']
                title = page.get('title') or 'untitled'
                get_detail(article_url, title)

        offset += page_size

if __name__ == "__main__":
    # Entry point: begin fetching at the first results page (offset 0).
    get_url(offset=0)

转载于:https://www.cnblogs.com/luwanhe/p/9490785.html

你可能感兴趣的:(爬取今日头条)