Code for Scraping Weibo Comments

About Web Crawlers (1)

Scraping Weibo comments
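The script below requests the comments of a single Weibo post through the site's AJAX comment endpoint (comment/big). Each response is JSON, and only the data → html field is used: it holds a server-rendered HTML fragment of the comment list, which BeautifulSoup then parses to pull out each comment's text. Paging is driven by the page, sum_comment_number, and root_comment_max_id parameters, and the cookie string in the request headers appears to be copied from a logged-in browser session.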

# coding=utf-8
import bs4
import requests


def getUrl(url):
    # Request headers: a desktop User-Agent plus a cookie string taken from a browser session.
    head = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Version/14.0 Safari/605.1.15',
        'cookie': 'cross_origin_proto=SSL; Apache=9248610096172.86.1601702374292; '
                  'SINAGLOBAL=9248610096172.86.1601702374292; '
                  'ULV=1601702374297:5:1:3:9248610096172.86.1601702374292:1601205183902; UOR=baidu.com,weibo.com,'
                  'baidu.com; _s_tentry=baidu.com; wb_view_log=1440*9002; WBStorage=70753a84f86f85ff|undefined; '
                  'login_sid_t=ea6532b90a65382499bc8bc3b8c411d2; '
                  'SUB=_2AkModLgCf8NxqwJRmP0VzWnnbY92wwzEieKeKEnZJRMxHRl-yT_nqhQbtRB6A_SW7UzYV2u4jCUtL'
                  '-JGazVDIFpfpGH4; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WFMlkjK8_2...kALQdYsmdD '
    }
    req = requests.get(url, headers=head)
    # The endpoint returns JSON; the rendered comment list lives in data["html"].
    return req.json()["data"]["html"]


# AJAX comment endpoint for one Weibo post; the paging parameters are appended in the loop below.
baseUrl = "https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4555570424058617&from=singleWeiBo&__rnd=1601702735666"

root_comment_max_id = ""  # ID of the last comment on the previous page minus one; used to build the next page's URL
sum_comment_number = 0    # number of comments already shown (the loop passes i * 15 directly)

for i in range(1, 44):  # number of comment pages for this particular post
    data = getUrl(baseUrl + "&page=" + str(i) + "&sum_comment_number=" + str(i * 15)
                  + "&root_comment_max_id=" + str(root_comment_max_id))
    soup = bs4.BeautifulSoup(data, "lxml")
    comment = soup.findAll('div', class_='list_li S_line1 clearfix')
    # Each comment element carries a comment_id attribute; the next page starts just below the last one.
    root_comment_max_id = int(comment[-1]["comment_id"]) - 1
    for j in comment:
        # WB_text holds the comment body; the [1:] slice skips its leading character.
        print(j.find("div", class_='WB_text').text[1:])
    print(i)  # progress indicator: the page just processed

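If you want to keep the results instead of just printing them, here is a minimal sketch of the same loop that writes each comment to a CSV file and pauses between requests. It is not part of the original script: the fetch_page helper, the comments.csv path, the placeholder cookie/User-Agent values, and the one-second delay are all arbitrary choices, while the endpoint, parameters, and CSS class names are taken from the code above.

# A possible extension (not in the original post): save comments to CSV and rate-limit requests.
import csv
import time

import bs4
import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0',          # replace with a real browser User-Agent string
    'cookie': 'PASTE_YOUR_WEIBO_COOKIE',  # copied from a logged-in browser session
}
BASE_URL = ("https://weibo.com/aj/v6/comment/big"
            "?ajwvr=6&id=4555570424058617&from=singleWeiBo&__rnd=1601702735666")


def fetch_page(page, root_comment_max_id, shown):
    """Fetch one page of comments and return the HTML fragment from the JSON payload."""
    url = (BASE_URL + "&page=" + str(page)
           + "&sum_comment_number=" + str(shown)
           + "&root_comment_max_id=" + str(root_comment_max_id))
    resp = requests.get(url, headers=HEADERS)
    return resp.json()["data"]["html"]


with open("comments.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["page", "comment_id", "text"])
    root_comment_max_id = ""
    for page in range(1, 44):
        html = fetch_page(page, root_comment_max_id, page * 15)
        soup = bs4.BeautifulSoup(html, "lxml")
        items = soup.find_all("div", class_="list_li S_line1 clearfix")
        if not items:          # stop early if a page comes back empty
            break
        for item in items:
            text = item.find("div", class_="WB_text").text[1:]
            writer.writerow([page, item["comment_id"], text])
        root_comment_max_id = int(items[-1]["comment_id"]) - 1
        time.sleep(1)          # be polite: wait a second between requests

Recording the page number alongside each row makes it easy to see where the crawl stopped if Weibo starts returning empty pages or the cookie expires partway through.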