Crawling All of Baidu Zhidao's Q&A Data in 80 Lines of Code

Crawling strategy for Zhidao Q&A

Use a breadth-first crawling strategy
Use the redis-queue-tool distributed crawling framework (a minimal sketch of its publish/consume pattern follows)
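
The queue is what ties these two points together: a producer publishes list-page URLs into a named queue, and a decorated consumer function drains it with multiple threads, so pages are processed roughly in the order they were discovered, which gives the breadth-first behavior. Below is a minimal sketch using only the redis-queue-tool calls that appear in the full example later in this post (the queue name demo:pages and the function handle_page are placeholder names):

from redis_queue_tool import task_deco

@task_deco('demo:pages')  # bind this function to a named task queue
def handle_page(url):
    print('crawling', url)  # fetching and parsing would go here

handle_page.pub('https://example.com/page1')  # producer side: enqueue a URL
handle_page.start()  # consumer side: start draining the queue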

Install the dependencies

pip install requests
pip install bs4
pip install py-log
pip install retrying
pip install redis-queue-tool==4.4.0
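
Before running the spider, a quick optional sanity check confirms that all five dependencies import cleanly:

# optional: verify that every dependency is importable
import requests, bs4, retrying
from py_log import get_logger
from redis_queue_tool import task_deco
print('all dependencies available')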

Sample code

# -*- coding: utf-8 -*-
# @Time    : 2020/7/18 10:53
# @Author  : CC
# @Desc    : baidu_zhidao_spider.py
# @Notice  : Please do not use for commercial purposes. All consequences are borne by the user
import re
import time
import traceback
import requests
from urllib.parse import quote
from bs4 import BeautifulSoup
from py_log import get_logger
from redis_queue_tool import task_deco
from retrying import retry

logger = get_logger('baidu_zhidao_spider')


@task_deco('zhidao:questions:list', threads_num=20, qps=15)
def get_list_data(url):
    try:
        wb_data = get_list_source_html(url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # select the elements to scrape from the list page
        titles = soup.select('a.ti')
        answer_times = soup.select('dd.dd.explain.f-light > span:nth-of-type(1)')
        answer_users = soup.select('dd.dd.explain.f-light > span:nth-of-type(2) > a')
        answers = soup.select('dd.dd.explain.f-light > span:nth-of-type(3) > a')
        # extract the useful fields from the matched elements
        for title, answer_time, answer_user, answer in zip(titles, answer_times, answer_users, answers):
            data = {'answer_time': answer_time.get_text(),
                    'answer_user': answer_user.get_text(),
                    'answers': answer.get_text().replace('个回答', ''),
                    'answer_detail_url': title['href'],
                    'create_time': int(time.time() * 1000)}
            logger.info(data)
    except Exception:
        logger.error(traceback.format_exc())


def push_list_task(keywords: list = ['景德镇']):
    if keywords:
        for keyword in keywords:
            try:
                keyword = quote(keyword, encoding='gbk')  # the endpoint declares ie=gbk, so percent-encode the keyword as GBK
                url_first = f'https://zhidao.baidu.com/search?lm=0&rn=10&pn=10&fr=search&ie=gbk&word={keyword}'
                pagesize = int(get_totalcount_by_keyword(url_first) / 10)
                for page in range(0, pagesize + 1):
                    url = f'https://zhidao.baidu.com/search?lm=0&rn=10&pn={page * 10}&fr=search&ie=gbk&word={keyword}'
                    get_list_data.pub(url)
            except Exception:
                get_list_data.pub(url_first)
                logger.error(traceback.format_exc())


def get_totalcount_by_keyword(url):
    wb_data = get_list_source_html(url)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    last_url = soup.select('a.pager-last')[0]['href']
    total_count = re.match(r".*pn=(\d+)", last_url).group(1)  # the pn parameter of the last pager link marks the final page offset
    logger.info(f'url:{url}, total_count:{total_count}')
    return int(total_count)


@retry(stop_max_attempt_number=5)
def get_list_source_html(url):
    logger.info(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
        'Cookie': 'ab_jid=7f174627b431e8ca4f8a5b57e4558ec3d1d6; ab_jid=7f174627b431e8ca4f8a5b57e4558ec3d1d6; BAIDUID=F0A2CA8353329EC7266079FDA6DC9E77:FG=1; ZD_ENTRY=empty; BAIDUID=822782C0542CB8101C22F210324FF50F:FG=1; ZD_ENTRY=empty'
    }
    wb_data = requests.request("GET", url, headers=headers, timeout=10)
    wb_data.encoding = 'gbk'  # the search pages are served in GBK (ie=gbk)
    return wb_data


if __name__ == '__main__':
    push_list_task()
    get_list_data.start()
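
The consumer above only logs each scraped record. To keep the results, each data dict could be persisted as it arrives, for example as JSON Lines; here is a minimal, hypothetical save_record helper (not part of the original code) that could be called in get_list_data in place of, or alongside, logger.info(data):

import json

def save_record(data: dict, path: str = 'zhidao_answers.jsonl'):
    # append one record per line; JSON Lines files are easy to stream-process later
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False) + '\n')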

Framework parameter notes

The framework uses Redis as the middleware by default (if Redis is not installed, you can set the middleware='sqlite' parameter to use SQLite as the middleware instead; see the sketch after the parameter notes below).

@task_deco('zhidao:questions:list', threads_num=20, qps=15)
1. zhidao:questions:list is the queue name used for publishing and consuming tasks
2. threads_num=20 sets the number of threads for concurrent crawling
3. qps=15 caps the requests per second, to keep the load from overwhelming the crawled site
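
As noted above, Redis is not strictly required. Here is a sketch of the same consumer declared with the SQLite middleware, assuming the middleware parameter behaves as described in the redis-queue-tool documentation:

from redis_queue_tool import task_deco

# same queue semantics, but backed by a local SQLite file instead of Redis
@task_deco('zhidao:questions:list', threads_num=20, qps=15, middleware='sqlite')
def get_list_data(url):
    ...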

Parameter documentation for the distributed framework this spider depends on:

redis-queue-tool
