Python爬虫练习:采集头条文章图片,并存储到MongoDB数据库

mongo_config.py

# MongoDB connection settings
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Page range of the search result list to crawl
GROUP_START = 1
GROUP_END = 20

# Search keyword used for the result list
KEYWORD = '美食'

app.py

import requests
import pymongo
import json
import re
import os
from hashlib import md5
from urllib.parse import urlencode
from requests.exceptions import RequestException
from mongo_config import *
from multiprocessing import Pool

# MongoDB initialisation: build the client from MONGO_URL and select the
# database named by MONGO_DB (both come from mongo_config).
client = pymongo.MongoClient(MONGO_URL)
mango_db = client[MONGO_DB]

# Request one page of the search result list
def get_find_index(page=0, keyword="街拍", timeout=10):
    """Fetch one page of the Toutiao search API and return the raw body.

    Args:
        page: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled below like any other request failure.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None`` (non-200 status or any request error).
    """
    data = {
        'aid': 24,
        'app_name': 'web_search',
        'offset': page,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'en_qc': 1,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
        # NOTE(review): hard-coded timestamp; presumably captured from a
        # browser session -- confirm whether the API requires a fresh value.
        'timestamp': '1556807950562',
    }
    # Hard-coded browser user-agent and cookie sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'cookie': 'tt_webid=6686435681956447751; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16a78fa5ae74c8-012c5c6db69a84-6353160-1fa400-16a78fa5ae8380; CNZZDATA1259612802=1163063110-1556803453-%7C1556803453; __tasessionId=8pp8fiish1556807834386; tt_webid=6686435681956447751; csrftoken=fe46f2a51c61c8af81792a8cc3d368d9; s_v_web_id=3db23f6b1fb477d9e6f9b9a234e364b7',
    }
    url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)

    try:
        q_data = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求索引错误')
        return None

    if q_data.status_code == 200:
        return q_data.content
    return None

# Extract article URLs from the search result list
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in a search API response.

    Args:
        html: The JSON response body (``str`` or ``bytes``), or ``None``
            when the request failed.

    Yields:
        The ``article_url`` value of each dict entry under the top-level
        ``data`` key. Entries without that key yield ``None``, so callers
        should be prepared to skip empty URLs.

    Nothing is yielded when the body is missing, is not valid JSON, is not
    a JSON object, or has no list under ``data``.
    """
    try:
        data_json = json.loads(html)
    except (TypeError, ValueError):
        # TypeError: html is None or another non-string type.
        # ValueError: body is not valid JSON (json.JSONDecodeError) or not
        # decodable bytes -- e.g. an HTML error page served with status 200.
        return

    if not isinstance(data_json, dict):
        return

    items = data_json.get('data')
    if not isinstance(items, list):
        # The 'data' key is absent or holds null / a non-list value.
        return

    for item in items:
        if isinstance(item, dict):
            yield item.get('article_url')

# Request the HTML of an article page
def get_find_show(url, timeout=10):
    """Fetch an article page and return its HTML text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection; a timeout surfaces as a ``RequestException`` and is
            handled below like any other request failure.

    Returns:
        The decoded response text when the status code is 200, otherwise
        ``None`` (non-200 status or any request error).
    """
    # Hard-coded browser user-agent and cookie sent with every request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        'cookie': 'tt_webid=6686435681956447751; WEATHER_CITY=%E5%8C%97%E4%BA%AC; UM_distinctid=16a78fa5ae74c8-012c5c6db69a84-6353160-1fa400-16a78fa5ae8380; CNZZDATA1259612802=1163063110-1556803453-%7C1556803453; __tasessionId=8pp8fiish1556807834386; tt_webid=6686435681956447751; csrftoken=fe46f2a51c61c8af81792a8cc3d368d9; s_v_web_id=3db23f6b1fb477d9e6f9b9a234e364b7',
    }
    try:
        q_data = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('请求错误', url)
        return None

    if q_data.status_code == 200:
        return q_data.text
    return None

#内容页标题 + 图片URL列表 提取
def parse_page_show(html):
    try:
        title = re.search('(.*?)

总结:众所周知Bilibili是一个学习的网站!

你可能感兴趣的:(Python学习笔记)