利用Python3解析Ajax爬取今日头条图集

本文内容参考来自:

1.分析Ajax抓取今日头条街拍美图

2.爬虫学习-爬取今日头条图集

今日头条街拍图集首页是利用ajax异步加载方式加载数据。

详情页组图的URL嵌在HTML页面的JavaScript代码中(形如 gallery: JSON.parse("...")),需要使用正则表达式提取。

街拍图集首页
街拍图集首页请求参数
返回json格式数据
返回json数据的data中某一项
详情页组图的URL

完整代码如下:

1.toutiao_spider.py (爬取图片并下载到本地,同时将信息保存到MongoDB中)

#-*- coding: utf-8 -*-
import re
from urllib.parse import urlencode
import pymongo
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import requests
import json
import os
from hashlib import md5
from Mongo_config import *
from multiprocessing import Pool

# Shared MongoDB client. connect=False defers opening the connection until the
# first operation (see the PyMongo MongoClient documentation).
# NOTE(review): presumably chosen so that each multiprocessing worker opens its
# own connection after the fork -- confirm.
client = pymongo.MongoClient(MONGO_URL, connect=False)
# Handle to the database named by MONGON_DB (the name is spelled this way in
# Mongo_config, so it must stay in sync with that module).
db  = client[MONGON_DB]

# HTTP headers passed to every requests.get() call in this script.
headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'Keep-Alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }

# Fetch one page of the search index
def get_page_index(offset, keyword, timeout=10):
    """Request one page of gallery search results and return the raw body.

    Args:
        offset: Paging offset, sent as the ``offset`` query parameter.
        keyword: Search keyword, sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with status 200, otherwise
        None (non-200 status, timeout, or any other request failure).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)

    try:
        # requests waits forever by default; a stalled connection would block
        # this worker and, through pool.map, the whole crawl. A timeout raises
        # requests.exceptions.Timeout, which is a RequestException.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求索引出错')
        return None

    if response.status_code == 200:
        return response.text
    return None

# Parse the index response
def parse_page_index(html):
    """Yield the article URL of every entry in a search-index JSON response.

    Args:
        html: Response body of the index request as text, or None when the
            request failed.

    Yields:
        The non-empty ``article_url`` value of each entry in the top-level
        ``data`` list. Nothing is yielded when the body is missing, is not
        valid JSON, is not a JSON object, or has no usable ``data`` list.
    """
    if not html:
        # json.loads(None) would raise TypeError; a failed request simply
        # produces no URLs.
        return

    try:
        payload = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        print('解析索引出错')
        return

    if not isinstance(payload, dict):
        return

    # "data" may be absent or null; treat both as an empty list.
    for item in payload.get('data') or []:
        # Skip entries that carry no article URL.
        if isinstance(item, dict) and item.get('article_url'):
            yield item['article_url']

# Fetch a detail page
def get_page_detail(url, timeout=10):
    """Download a gallery detail page and return its HTML.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with status 200, otherwise
        None (non-200 status, timeout, or any other request failure).
    """
    try:
        # Bound the wait: without a timeout a stalled connection would hang
        # this worker indefinitely. Timeouts are RequestException subclasses.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求详情页出错', url)
        return None

    if response.status_code == 200:
        return response.text
    return None

# Matches the string literal handed to JSON.parse in the page's inline script.
_GALLERY_PATTERN = re.compile(r'gallery: JSON\.parse\("(.*?)"\)', re.S)


def _decode_gallery(raw):
    """Turn the escaped JSON.parse argument into a Python object, or None."""
    try:
        # The argument is a quoted string literal that itself contains JSON:
        # first unescape the literal, then parse the JSON text it holds.
        return json.loads(json.loads('"' + raw + '"'))
    except ValueError:
        pass
    try:
        # Fallback to the previous approach: drop the backslashes and parse.
        return json.loads(re.sub(r'\\{1,2}', '', raw))
    except ValueError:
        return None


# Parse a gallery detail page
def parse_page_detail(html, url):
    """Extract the title and image URLs of a gallery page and download them.

    Args:
        html: HTML text of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        A dict with the keys ``title``, ``url`` and ``images`` (list of image
        URLs), or None when the page holds no parsable gallery data.

    Side effects:
        Calls down_load_images() for every image URL found.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # A page without a <title> previously raised IndexError.
    title = title_tags[0].get_text() if title_tags else ''

    match = _GALLERY_PATTERN.search(html)
    if not match:
        # Not a gallery page. The original dereferenced the match before
        # checking it and raised AttributeError here.
        return None

    images_data = _decode_gallery(match.group(1))
    if not isinstance(images_data, dict) or 'sub_images' not in images_data:
        return None

    images = [
        item.get('url')
        for item in images_data.get('sub_images') or []
        if isinstance(item, dict) and item.get('url')
    ]
    for image in images:
        down_load_images(image)

    return {
        'title': title,
        'url': url,
        'images': images,
    }

# Download an image
def down_load_images(url, timeout=10):
    """Download one image and hand its bytes to save_images().

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always None. A non-200 response is ignored; a request failure
        (including a timeout) is reported on stdout.
    """
    print('正在下载', url)
    try:
        # Bound the wait so a stalled image server cannot hang the worker;
        # timeouts are RequestException subclasses and are reported below.
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求图片出错', url)
        return None

    if response.status_code == 200:
        save_images(response.content)
    return None

# Store an image on disk
def save_images(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Args:
        content: Raw image bytes.

    The file name is derived from the content hash, so identical images are
    stored only once.
    """
    file_name = '{0}.{1}'.format(md5(content).hexdigest(), 'jpg')
    file_path = os.path.join(os.getcwd(), file_name)
    try:
        # 'x' creates the file exclusively: the existence check and the
        # creation are one atomic step, so two pool workers that fetch the
        # same image cannot both write it. The with-block closes the file.
        with open(file_path, 'xb') as f:
            f.write(content)
    except FileExistsError:
        # Already saved earlier (or by another worker); nothing to do.
        pass

# Store a record in MongoDB
def save_to_mongo(result):
    """Insert one parsed gallery record into the MONGO_TABLE collection.

    Args:
        result: Dict describing a gallery. PyMongo adds an ``_id`` key to it
            as part of the insert.

    Returns:
        True when the document was stored, False otherwise.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is its replacement for a single document.
        outcome = db[MONGO_TABLE].insert_one(result)
    except pymongo.errors.PyMongoError as exc:
        # Report the failure through the return value instead of letting one
        # database error abort the whole worker.
        print('存储失败', exc)
        return False

    if outcome.inserted_id is not None:
        print('存储成功', result)
        return True
    return False

def main(offset):
    """Crawl one index page: fetch it, then parse and store every gallery.

    Args:
        offset: Paging offset of the index page to process.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed or returned a non-200 status; there is
        # nothing to parse for this offset.
        return

    for url in parse_page_index(index_html):
        detail_html = get_page_detail(url)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            if result:
                save_to_mongo(result)
            print(result)

if __name__ == '__main__':
    # One offset per index page, in steps of 20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # map() blocks until every offset has been processed; the context manager
    # then shuts the worker processes down instead of leaving the pool open.
    with Pool() as pool:
        pool.map(main, groups)

2.Mongo_config.py (定义MongoDB全局配置信息)

#-*- coding: utf-8 -*-
# Host passed to pymongo.MongoClient by the spider. Per the original note: no
# database password is needed here when MongoDB has none; fill one in if it does.
MONGO_URL = 'localhost'
# Database name. The identifier is spelled MONGON_DB and the spider imports it
# under exactly this name.
MONGON_DB = 'toutiao'
# Collection the spider inserts gallery records into.
MONGO_TABLE = 'toutiao'

# First and last index-page numbers (inclusive); the spider multiplies each by
# 20 to get the request offset.
GROUP_START = 1
GROUP_END = 20
# Search keyword sent with every index request.
KEYWORD = '街拍'

3.运行结果


Pycharm运行结果

下载到本地

保存信息到Mongodb

你可能感兴趣的:(利用Python3解析Ajax爬取今日头条图集)