10、实战2:爬取微博内容

附源码:

from urllib.parse import urlencode

import requests

from pyquery import PyQuery as pq

# Endpoint of the m.weibo.cn container API. It ends with '?' because get_page
# appends an urlencoded query string directly to it.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# HTTP headers sent with every API request made by get_page.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/3519635132',
    # Desktop Chrome user agent, wrapped over several literals for readability;
    # adjacent string literals are concatenated into the same single value.
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.100 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}

# Highest page number requested by the script entry point (pages start at 1).
max_page = 2

def get_page(page, timeout=10):
    """Request one page of results from the m.weibo.cn getIndex endpoint.

    Args:
        page: Page number sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the crawler indefinitely.

    Returns:
        A ``(data, page)`` tuple. ``data`` is the decoded JSON body when the
        server answers with HTTP 200; it is ``None`` when the status is not
        200, the request fails, or the body is not valid JSON. A tuple is
        returned in every case so that ``parse_page(*get_page(page))`` never
        tries to unpack ``None``.
    """
    params = {
        'type': 'uid',
        'value': '3519635132',
        'containerid': '1076033519635132',
        'page': page,
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json(), page
        print('Error', 'unexpected status code', response.status_code)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts; ValueError
        # covers a body that cannot be decoded as JSON.
        print('Error', e.args)
    return None, page

def parse_page(json, page: int):
    """Yield a summary dict for every post card found in an API response.

    Args:
        json: Decoded response body (a dict), or a falsy value when nothing
            was fetched; in that case nothing is yielded.
        page: Page number the response belongs to.

    Yields:
        Dicts with the keys ``id``, ``text``, ``attitudes``, ``comments`` and
        ``reposts`` taken from each card's ``mblog`` entry. ``text`` has its
        HTML markup stripped with PyQuery and is ``''`` when the post carries
        no text.
    """
    if not json:
        return
    # Tolerate responses where 'data' or 'cards' is missing or null.
    cards = (json.get('data') or {}).get('cards') or []
    for index, card in enumerate(cards):
        # Kept from the original: the second card of the first page is always
        # skipped (presumably a non-post card - TODO confirm against the API).
        if page == 1 and index == 1:
            continue
        mblog = card.get('mblog')
        if not mblog:
            # Cards without a post payload would make pq(None) raise TypeError.
            continue
        text = mblog.get('text')
        yield {
            'id': mblog.get('id'),
            # The text field is HTML; PyQuery's .text() extracts the plain text.
            'text': pq(text).text() if text else '',
            'attitudes': mblog.get('attitudes_count'),  # number of likes
            'comments': mblog.get('comments_count'),  # number of comments
            'reposts': mblog.get('reposts_count'),  # number of reposts
        }

if __name__ == '__main__':
    # Crawl pages 1..max_page and print every extracted post summary.
    for page in range(1, max_page + 1):
        fetched = get_page(page)
        # A failed fetch may produce no (data, page) tuple at all; skip the
        # page instead of crashing while unpacking None.
        if not fetched:
            continue
        for result in parse_page(*fetched):
            print(result)

你可能感兴趣的:(10、实战2:爬取微博内容)