(三)python爬虫实例 - 微博评论爬取

1、环境

  • pycharm,python3.6以上(代码使用了 f-string,需要 Python 3.6+),requests,BeautifulSoup4,chrome

2、代码

import requests
from bs4 import BeautifulSoup
from urllib import parse

# Pagination cursors shared across calls to get_con_page(); they are
# re-populated after each request from the page's 'action-data' query
# string so the next call fetches the following page.
RQS_ID = ''  # *** manually copy the id of the first comment here
ROOT_COMMENT_MAX_ID = ''  # cursor returned by the previous page
ROOT_COMMENT_MAX_ID_TYPE = ''  # cursor type returned by the previous page


def get_con_page(nbr):
    """Fetch one page of Weibo comments, print them, and advance the cursors.

    Side effects: prints each comment's text and the raw 'action-data'
    pagination blob, and updates the module-level cursors RQS_ID,
    ROOT_COMMENT_MAX_ID and ROOT_COMMENT_MAX_ID_TYPE so the next call
    requests the following page.

    :param nbr: 1-based page number; page 1 uses the simpler entry URL
                because no cursors are known yet.
    """
    global RQS_ID, ROOT_COMMENT_MAX_ID, ROOT_COMMENT_MAX_ID_TYPE
    headers = {
        "Cookie": ""  # *** manually paste your logged-in Weibo cookie here
    }
    if nbr == 1:
        url = (
            f'https://weibo.com/aj/v6/comment/big'
            f'?ajwvr=6&id={RQS_ID}&from=singleWeiBo'
        )
    else:
        # BUG FIX: the original mixed f-strings with %-formatting
        # (f'&page=%s...' % nbr), which only worked by accident of
        # implicit string concatenation and would break if the URL ever
        # contained another '%'. Build the URL with f-strings only.
        url = (
            f'https://weibo.com/aj/v6/comment/big'
            f'?ajwvr=6&id={RQS_ID}'
            f'&root_comment_max_id={ROOT_COMMENT_MAX_ID}'
            f'&root_comment_max_id_type={ROOT_COMMENT_MAX_ID_TYPE}'
            f'&root_comment_ext_param='
            f'&page={nbr}&filter=hot'
            f'&filter_tips_before=0&from=singleWeiBo'
        )
    res = requests.get(url, headers=headers)

    # The endpoint returns JSON whose 'data.html' field is an HTML fragment.
    html = res.json()['data']['html']
    soup = BeautifulSoup(html, 'html.parser')
    m_con_list = soup.find_all('div', attrs={'node-type': 'replywrap'})
    for m_con in m_con_list:
        con_text = m_con.find('div', class_='WB_text').text.strip()
        print(con_text)

    # The pagination cursors live in an 'action-data' attribute, either on
    # the loading placeholder or on the "more comments" link.
    loading = soup.find('div', attrs={'node-type': 'comment_loading'})
    if loading is not None:
        action_data = loading['action-data']
    else:
        more_link = soup.find('a', attrs={'action-type': 'click_more_comment'})
        if more_link is None:
            # BUG FIX: the original subscripted find(...) unconditionally
            # and crashed with TypeError on the last page. Leave the
            # cursors untouched and stop gracefully instead.
            return
        action_data = more_link['action-data']

    print(action_data)
    # 'action-data' is a query string; parse_qs maps each key to a list.
    parse_qs = parse.parse_qs(action_data)
    RQS_ID = parse_qs['id'][0]
    ROOT_COMMENT_MAX_ID = parse_qs['root_comment_max_id'][0]
    ROOT_COMMENT_MAX_ID_TYPE = parse_qs['root_comment_max_id_type'][0]


# Crawl pages 1 through 149, printing each page's comments followed by
# a dashed divider line.
for _page in range(1, 150):
    print('第%d页' % _page)
    get_con_page(_page)
    print('-' * 120)

运行结果:


欢迎大家加入qq群一起交流爬虫技术:python爬虫技术交流群(494976303)


你可能感兴趣的:((三)python爬虫实例 - 微博评论爬取)