python爬取微博文本

#!/usr/bin/python
# -*- coding=utf-8 -*-
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import pandas as pd
# Endpoint of the m.weibo.cn container API; it ends with '?' so that an
# encoded query string can be appended directly.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# HTTP headers passed along with the API requests.
# NOTE(review): the Referer still contains the 'xxx' placeholder for the
# target user's id - fill it in before running.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/xxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.87 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
def get_page(since_id, timeout=10):
    """Fetch one page of results from the m.weibo.cn container API.

    Args:
        since_id: Pagination cursor. ``0`` requests the first page; any
            other value is sent as the ``since_id`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure, timeout or
        a body that is not valid JSON).
    """
    params = {
        'uid': 'xxx',  # change as needed
        't': '0',
        'luicode': 'xxx',
        'type': 'uid',
        'value': '1862462415',
        'containerid': '1076031862462415',
    }
    if since_id != 0:
        params['since_id'] = since_id
    url = base_url + urlencode(params)
    print(url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection errors and timeouts;
        # ValueError covers a 200 response whose body is not JSON.
        print('error', e.args)
    return None
def parse_page(json):
    """Yield one flat record per post found in an API response.

    Args:
        json: Decoded response as returned by ``get_page`` (a dict), or a
            falsy value when the request failed.

    Yields:
        dict with the keys ``id``, ``time``, ``text``, ``reposts``,
        ``comment``, ``attitude`` and ``user``. ``text`` is the post body
        with HTML markup stripped ('' when the post has no text).

    Cards that carry no ``mblog`` object are skipped.
    """
    if not json:
        return
    cards = (json.get('data') or {}).get('cards') or []
    for item in cards:
        # The post itself is nested under 'mblog'; the card is only a wrapper.
        mblog = item.get('mblog')
        if not mblog:
            continue
        text = mblog.get('text')
        user = mblog.get('user') or {}
        yield {
            'id': mblog.get('id'),
            'time': mblog.get('created_at'),
            # Only hand non-empty markup to pyquery.
            'text': pq(text).text() if text else '',
            'reposts': mblog.get('reposts_count'),
            'comment': mblog.get('comments_count'),
            'attitude': mblog.get('attitudes_count'),
            'user': user.get('screen_name'),
        }
def main(max_pages=99):
    """Crawl up to ``max_pages`` pages and export the posts to Excel.

    Pages are fetched one after another, feeding the ``since_id`` cursor
    found in each response into the next request. The crawl stops as soon
    as a response carries no cursor (or the request failed), so the same
    page is never fetched twice.
    """
    since_id = 0
    danny = []
    for _ in range(max_pages):
        jsone = get_page(since_id)
        print(jsone)
        try:
            # Errors surface while the generator is being consumed, so the
            # whole iteration is guarded; records yielded before a failure
            # are kept.
            for result in parse_page(jsone):
                danny.append(result)
        except Exception as e:
            print('='*10+"此内容无法显示"+"="*10)
            print('error', e.args)
        try:
            since_id = jsone['data']['cardlistInfo']['since_id']
        except (KeyError, TypeError):
            # No cursor (or no response at all): nothing more to fetch.
            print("到头了")
            break
    danny_f = pd.DataFrame(danny)
    # The `encoding` keyword is intentionally not passed: pandas deprecated
    # it for to_excel and newer releases reject it.
    # NOTE(review): writing the legacy .xls format needs pandas < 2.0 with
    # xlwt installed - confirm the target environment.
    danny_f.to_excel(r'C:\Users\xxxx\Desktop\项目实战数据\xx.xls')
    print(danny_f)


if __name__ == '__main__':
    main()

你可能感兴趣的:(python爬取微博文本)