# 第四次作业:对微博关注中街舞分类下的内容进行爬取
# (Assignment 4: crawl the posts under the "street dance" group of the Weibo following feed)

import requests

import json

import re

# HTTP headers sent with every feed request: a mobile Chrome User-Agent plus a
# cookie string.
# NOTE(review): the cookie looks like a captured login session (SUB/SUBP/
# SSOLoginState values) -- presumably a credential that expires; confirm
# whether it should live in source at all.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Mobile Safari/537.36'
    ),
    # One cookie per line; adjacent literals concatenate into a single
    # '; '-separated header value.
    'cookie': (
        'SSOLoginState=1560744529; '
        'ALF=1563336529; '
        'SCF=AvTBvMMmz0oZngGtGSQ33rN-nryQ4Lw-Q9ZgXAHOU7FKal63f2BX601Mw8qrS1TwAyGD7MSpzamSAeaYvhnUfmo.; '
        'SUB=_2A25wA2IBDeRhGeRP41US-CzKyjyIHXVTDA5JrDV6PUNbktAKLRfhkW1NU-rmoTDAZi6xmOJKcVSCgl64cCl4ftoR; '
        'SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW8judAe5n2NqdgSI.PKq9S5JpX5KMhUgL.Fozp1hM01hzceK52dJLoIEnLxKMLBK.LB.qLxK.L1h5L12qLxKML1heL1-qLxK.L1-zLB.HjIgp4; '
        'SUHB=0tKbOEMSFuEDBK; '
        'MLOGIN=1; '
        '_T_WM=62613902799; '
        'WEIBOCN_FROM=1110005030; '
        'XSRF-TOKEN=53f153; '
        'M_WEIBOCN_PARAMS=lfid%3D1005052187382610%26luicode%3D20000174%26uicode%3D20000174'
    ),
}

# Feed URL for group id 4218621758638928 with an empty max_id query value.
url = 'https://m.weibo.cn/feed/group?gid=4218621758638928&max_id='

def get_info(url, no, max_pages=5):
    """Print the tag-stripped text of every status in the feed, page by page.

    Requests ``url`` with the module-level ``headers``, prints each status
    text with its ``<...>`` markup removed, then follows the ``max_id`` value
    from the response to the next page until ``max_pages`` is reached.

    Args:
        url: Feed URL of the first page to request.
        no: Page number assigned to ``url`` (the script starts at 1).
        max_pages: Highest page number to fetch. The page at ``url`` is
            always fetched, even if ``no`` already exceeds this value.

    Raises:
        requests.RequestException: If a request fails, times out, or returns
            an HTTP error status.
        KeyError: If the response JSON has no ``data``/``statuses`` entry or
            a status has no ``text``.
    """
    # Compiled with the flag attached: the fourth positional argument of
    # re.sub() is `count`, so passing re.S there limits the number of
    # replacements instead of enabling DOTALL.
    tag_pattern = re.compile(r'<.*?>', re.S)
    next_url_template = 'https://m.weibo.cn/feed/group?gid=4218621758638928&max_id={}'

    while True:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
        data = json.loads(res.text)['data']

        for status in data['statuses']:
            print(tag_pattern.sub('', status['text']))

        # Pagination is decided once per page, not once per status, so each
        # page is requested a single time.
        no += 1
        max_id = data.get('max_id')
        # Without a max_id there is nothing to build the next URL from --
        # presumably the end of the feed; TODO confirm against the API.
        if no > max_pages or not max_id:
            break
        url = next_url_template.format(max_id)

# Script entry point: start at the base feed URL, numbering it as page 1.
get_info(url,1)


# 你可能感兴趣的:(第四次作业:对微博关注中街舞分类下的内容进行爬取)