多进程 + MongoDB 爬取今日头条街拍图集

注意

TypeError: can't pickle _thread.lock objects

  • 进程池在分发任务时需要序列化(pickle)对象;pymongo 的 MongoClient 内部持有线程锁(_thread.lock),无法被序列化。因此不能把数据库连接放在实例属性(__init__)里,而应定义为类属性,让每个子进程各自使用自己的连接。
# python3.7
# 使用多进程对今日头条街拍图片进行爬取,并将图片相关信息保存到MongoDB数据库
import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests


class JiePaiSpider(object):
    """Scrape Toutiao "jiepai" (street-snap) galleries and store image info in MongoDB.

    The MongoDB connection is a *class* attribute on purpose: a MongoClient
    holds internal thread locks and cannot be pickled, so keeping it out of
    the instance __dict__ lets bound methods of an instance be shipped to
    multiprocessing workers without "TypeError: can't pickle _thread.lock".
    """

    client = pymongo.MongoClient('localhost')  # shared class-level connection
    db = client['jiepai']  # 'jiepai' database

    def __init__(self):
        # Desktop Chrome User-Agent so the API serves the normal JSON payload.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
        }

    def get_list_json(self, offset):
        """
        Request the list-page JSON API and return the raw JSON text.

        :param offset: paging offset parameter for the API (0, 20, 40, ...)
        :return: response body as str on success, None on any failure
        """
        params = {
            'offset': offset,
            'format': 'json',
            'keyword': '街拍',
            'autoload': 'true',
            'count': '20',
            'cur_tab': '1',
            'from': 'search_tab',
            'pd': 'synthesis'
        }
        # urlencode() builds the query string from the params dict.
        api_url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
        try:
            response = requests.get(api_url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            else:
                print('请求异常:url={}, status_code={}'.format(api_url, response.status_code))
                return None
        except Exception as e:
            print('请求异常:url={}, error={}'.format(api_url, e))
            return None

    def parse_list_json(self, json_str):
        """
        Parse the list-page JSON and extract article (detail-page) URLs.

        :param json_str: JSON string returned by the list API
        :return: list of article URLs (empty list when no usable data)
        """
        # Initialize up front so a payload without 'data' (or with an empty
        # list) returns [] instead of raising UnboundLocalError.
        urls = []
        json_dict = json.loads(json_str)
        data_list = json_dict.get('data')
        if data_list:
            for item in data_list:
                # Entries carrying 'single_mode'/'cell_type' are ads or
                # non-gallery cells; skip them.
                if 'single_mode' not in item and 'cell_type' not in item:
                    article_url = item.get('article_url')
                    if article_url:
                        urls.append(article_url)
        return urls

    def get_detail_page(self, detail_url):
        """
        Fetch a detail (gallery) page.

        :param detail_url: URL of the gallery article
        :return: page HTML as str on success, None on any failure
        """
        try:
            response = requests.get(detail_url, headers=self.headers)
            if response.status_code == 200:
                return response.text
            else:
                print('请求异常:url={}, status_code={}'.format(detail_url, response.status_code))
                return None
        except Exception as e:
            print('请求异常:url={}, error={}'.format(detail_url, e))
            return None

    def parse_detail_page(self, detail_html):
        """
        Extract the embedded gallery JSON from a detail page, download each
        image and record its info in MongoDB.

        :param detail_html: HTML of the detail page
        """
        # The gallery data is embedded as: gallery: JSON.parse("..."),
        # \( escapes the literal parenthesis in the regex.
        matches = re.findall(re.compile(r'gallery: JSON.parse\((.*?)\),', re.S), detail_html)
        if not matches:
            # Pages without an embedded gallery (e.g. plain articles) are skipped.
            print('详情页未找到gallery数据')
            return
        # Strip the JS escaping and the surrounding quotes to get plain JSON.
        js_json_str = matches[0].replace('\\', '').strip('"')
        try:
            data_dict = json.loads(js_json_str)
        except ValueError as e:
            print('gallery数据解析失败:{}'.format(e))
            return

        for item_dict in data_dict.get('sub_images', []):  # one entry per image
            img_url = item_dict['url']
            self.download_image(img_url)  # download the image itself

            self.db['img'].insert_one(item_dict)  # persist the image metadata

    def download_image(self, img_url):
        """
        Download one image into imgs/<md5-of-url>.jpg.

        :param img_url: direct URL of the image
        """
        # Make sure the target directory exists; otherwise open() raises
        # FileNotFoundError on the first download.
        os.makedirs('imgs', exist_ok=True)
        response = requests.get(img_url, headers=self.headers)
        if response.status_code == 200:
            # Images are binary: use response.content, not response.text.
            # md5() needs bytes, hence the encode(); hexdigest() gives a
            # stable, filesystem-safe file name.
            img_name = md5(img_url.encode('utf-8')).hexdigest()
            # 'wb' for binary data; the with-block guarantees the file is closed.
            with open('imgs/{}.jpg'.format(img_name), 'wb') as f:
                f.write(response.content)
            print('正在下载图片...')
        else:
            print('图片url请求失败:{}'.format(img_url))

    def start_spider(self, offset):
        """
        Full pipeline for one page: list API -> article URLs -> detail pages
        -> image download + MongoDB insert.

        :param offset: paging offset passed to the list API
        """
        json_str = self.get_list_json(offset)
        if json_str:
            urls = self.parse_list_json(json_str)
            for detail_url in urls:
                detail_html = self.get_detail_page(detail_url)
                if detail_html:
                    self.parse_detail_page(detail_html)


if __name__ == '__main__':
    jp = JiePaiSpider()
    pool = Pool(3)  # pool of 3 worker processes
    # Offsets 0, 20, 40, 60, 80, 100 — range() with a step is clearer than
    # filtering range(0, 101) with a modulo test.
    pool.map(jp.start_spider, list(range(0, 101, 20)))
    pool.close()  # stop accepting new tasks
    pool.join()   # block until all workers have finished
    # Only report completion after the workers have been joined.
    print('爬取完毕')

爬取到的部分数据

多进程 + MongoDB 爬取今日头条街拍图集_第1张图片

你可能感兴趣的:(多进程 + MongoDB 爬取今日头条街拍图集)