1,要爬取微博及评论有三个方向可选:
1. weibo.com(网页版)
2. m.weibo.cn(手机版)
3. weibo.cn(移动版)
网页版的反爬技术已经很完善,作为初学者,建议选择后两者练手。本文以手机版m.weibo.cn为例。
2,要爬取数据,第一步就是分析数据结构:打开m.weibo.cn中要爬取的页面,按F12打开开发者工具,在其中找到对应的请求头信息与返回数据的结构
3,编辑请求头信息(一定要加上自己登录后从浏览器中复制的cookie)
# Request headers sent with every API call. The article stresses that the
# Cookie header must be included.
# NOTE: `agent` is not defined in this excerpt; in the full listing it is a
# User-Agent string picked at random from a small list.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Cookie': 'ALF=1584682452; SCF=Ao1AQDSyNMyR23TDx5xG_IJe2v4XqtrohlDD67o143N_ilDj19QP-Ffy2qe6e9RCYw2cmt-buWEsw2I7h1PUgm8.; SUB=_2A25zSLo0DeRhGeFM4lUZ9yrJwj6IHXVQssZ8rDV6PUJbktANLXfDkW1NQIfd1Q7IIALrtuY8yxDNe-TWrxqGr9QJ; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh-dpVkFAo9J5yWOxXdB82b5JpX5K-hUgL.FoME1KMRS0Bf1Kz2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeo.N1hMXSK.E; SUHB=0rr7f9X9Ndnx8q; _T_WM=52236839197; XSRF-TOKEN=9d35ef; WEIBOCN_FROM=1110006030; MLOGIN=1',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'User-Agent': agent
}
4,编写url,其中参数id是要爬取的博主的用户id,page是要爬取该博主微博列表的第几页
# The first three {} slots (uid, value, and the suffix of containerid) all
# receive the same id; the last slot is the page number.
url='https://m.weibo.cn/api/container/getIndex?uid={}&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%9B%9B%E5%B7%9D%E6%97%A5%E6%8A%A5&type=uid&value={}&containerid=107603{}&page={}'.format(id,id,id,page)
# Send the GET request with the headers built in the previous step.
response=requests.get(url,headers=headers)
5,根据爬出的json结构获取需要的数据
# Decode the raw response bytes as UTF-8 and parse them as JSON.
a = response.content.decode('utf-8')
ob_json=json.loads(a )
#print(a)
# The list of cards sits under data -> cards in the parsed response.
list_cards=ob_json['data']['cards']
6,将爬取的数据写入表格中
# Create the output workbook and a worksheet to write into.
workbook = xlsxwriter.Workbook("demo70.xlsx")
worksheet1 = workbook.add_worksheet()
# `m` is a zero-based index into the collected lists; in the full listing
# these writes run inside `for m in range(len(id))`. Cell references are
# one-based, hence m + 1.
worksheet1.write("A" + str(m + 1), name[m])
worksheet1.write("B" + str(m + 1), id[m])
# close() is called once all cells have been written.
workbook.close()
参考完整代码:(ps:完整代码中有许多被注释掉的语句,是因为俺也是第一次写,在不断尝试中才写成功的,泪目。)
import requests
import json
from lxml import etree
import xlsxwriter
import random
import time
#agent1='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0'
#agent2='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)'
#agent3='Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'
#list1=[agent1,agent2,agent3]
#agent= random.choice(list1)
#(agent)
#headers={
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
# 'Cookie' : 'ALF=1584682452; SCF=Ao1AQDSyNMyR23TDx5xG_IJe2v4XqtrohlDD67o143N_ilDj19QP-Ffy2qe6e9RCYw2cmt-buWEsw2I7h1PUgm8.; SUB=_2A25zSLo0DeRhGeFM4lUZ9yrJwj6IHXVQssZ8rDV6PUJbktANLXfDkW1NQIfd1Q7IIALrtuY8yxDNe-TWrxqGr9QJ; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh-dpVkFAo9J5yWOxXdB82b5JpX5K-hUgL.FoME1KMRS0Bf1Kz2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeo.N1hMXSK.E; SUHB=0rr7f9X9Ndnx8q; _T_WM=52236839197; XSRF-TOKEN=9d35ef; WEIBOCN_FROM=1110006030; MLOGIN=1',
# 'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
# 'User-Agent': agent
#}
class Weibo(object):
    """Small crawler for the m.weibo.cn JSON endpoints.

    ``get_weibo`` fetches one page of a user's timeline cards and
    ``get_comments`` fetches one page of comments for a single post.
    ``main`` combines both and appends the extracted fields to
    module-level lists that the calling script must create beforehand:
    ``id``, ``name``, ``created_at``, ``comments_count``,
    ``attitudes_count``, ``text`` (posts) and ``uprofile_image_url``,
    ``cid``, ``uscreen_name``, ``uprofile_url``, ``utext``,
    ``ucreated_at``, ``ulike_counts`` (comments).

    The class attributes below are configuration and may be overridden on
    the class or on an instance.
    """

    # Pool of User-Agent strings; one is chosen at random for every request.
    USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',
    )

    # NOTE(review): this is a browser session cookie hard-coded by the
    # original author; it is presumably expired and should be replaced with
    # your own (ideally loaded from outside the source file).
    COOKIE = 'ALF=1584682452; SCF=Ao1AQDSyNMyR23TDx5xG_IJe2v4XqtrohlDD67o143N_ilDj19QP-Ffy2qe6e9RCYw2cmt-buWEsw2I7h1PUgm8.; SUB=_2A25zSLo0DeRhGeFM4lUZ9yrJwj6IHXVQssZ8rDV6PUJbktANLXfDkW1NQIfd1Q7IIALrtuY8yxDNe-TWrxqGr9QJ; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh-dpVkFAo9J5yWOxXdB82b5JpX5K-hUgL.FoME1KMRS0Bf1Kz2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNeo.N1hMXSK.E; SUHB=0rr7f9X9Ndnx8q; _T_WM=52236839197; XSRF-TOKEN=9d35ef; WEIBOCN_FROM=1110006030; MLOGIN=1'

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10

    # Seconds to pause between two consecutive comment-page requests.
    COMMENT_PAGE_DELAY = 5

    def _build_headers(self):
        """Return the request headers with a randomly chosen User-Agent."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Cookie': self.COOKIE,
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': random.choice(self.USER_AGENTS),
        }

    def _fetch_json(self, url):
        """GET ``url`` and return the UTF-8 decoded body parsed as JSON.

        Prints the chosen User-Agent and the HTTP status code as progress
        output.

        Raises:
            requests.exceptions.RequestException: on network errors or
                when no response arrives within ``REQUEST_TIMEOUT``.
            ValueError: when the body is not valid JSON.
        """
        headers = self._build_headers()
        print(headers['User-Agent'])
        # A timeout keeps one stalled connection from hanging the crawl.
        response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
        print(response.status_code)
        return json.loads(response.content.decode('utf-8'))

    @staticmethod
    def _strip_tags(html):
        """Return the plain text of an HTML fragment ('' if nothing to parse)."""
        if not html:
            return ''
        tree = etree.HTML(text=html)
        # Guard against the parser producing no tree for the fragment.
        if tree is None:
            return ''
        return tree.xpath('string(.)')

    def get_weibo(self, id, page):  # id: user id
        """Fetch one page of timeline cards for the user ``id``.

        Args:
            id: user id, substituted into the uid, value and containerid
                query parameters.
            page: page number of the timeline.

        Returns:
            The list found under ``data -> cards`` in the response, or an
            empty list when the response carries no such entry.
        """
        url = 'https://m.weibo.cn/api/container/getIndex?uid={}&t=0&luicode=10000011&lfid=100103type%3D1%26q%3D%E5%9B%9B%E5%B7%9D%E6%97%A5%E6%8A%A5&type=uid&value={}&containerid=107603{}&page={}'.format(id, id, id, page)
        payload = self._fetch_json(url)
        data = payload.get('data')
        if not isinstance(data, dict):
            return []
        return data.get('cards', [])

    def get_comments(self, id, page):  # id: post id
        """Fetch one page of comments for the post ``id``.

        Args:
            id: id of the post whose comments are requested.
            page: page number of the comment list.

        Returns:
            The list found under ``data -> data`` when the response has
            ``ok == 1``, otherwise an empty list.
        """
        url = 'https://m.weibo.cn/api/comments/show?id={}&page={}'.format(id, page)
        payload = self._fetch_json(url)
        if payload.get('ok') != 1:
            return []
        return payload['data']['data']

    def main(self, uid, page):
        """Collect posts and all their comments for one timeline page.

        For every card with ``card_type == 9`` the post fields are appended
        to the module-level post lists, then every comment page of that
        post is fetched (pausing ``COMMENT_PAGE_DELAY`` seconds between
        pages) until an empty page is returned, appending each comment to
        the module-level comment lists.

        Args:
            uid: user id whose timeline is crawled.
            page: timeline page number.
        """
        for card in self.get_weibo(uid, page):
            # Only card_type 9 is treated as a post; other cards are skipped.
            if card.get('card_type') != 9:
                continue

            mblog = card['mblog']
            id.append(mblog['id'])
            name.append(mblog['user']['screen_name'])
            created_at.append(mblog['created_at'])
            comments_count.append(mblog['comments_count'])
            attitudes_count.append(mblog['attitudes_count'])
            # The post text is an HTML fragment; keep only its plain text.
            text.append(self._strip_tags(mblog['text']))
            print('******')

            post_id = mblog['id']
            comment_page = 1
            # Call through self so the method does not depend on a global
            # instance being named `weibo`.
            comments = self.get_comments(post_id, comment_page)
            while comments:
                for comment in comments:
                    user = comment['user']
                    uprofile_image_url.append(user['profile_image_url'])
                    cid.append(user['id'])
                    uscreen_name.append(user['screen_name'])
                    uprofile_url.append(user['profile_url'])
                    utext.append(self._strip_tags(comment['text']))
                    ucreated_at.append(comment['created_at'])  # timestamp
                    ulike_counts.append(comment['like_counts'])  # like count
                comment_page += 1
                # Pause between pages before requesting the next one.
                time.sleep(self.COMMENT_PAGE_DELAY)
                comments = self.get_comments(post_id, comment_page)
            print('=================')
if __name__ == '__main__':
    # Output workbook: the first sheet receives posts, the second comments.
    workbook = xlsxwriter.Workbook("demo70.xlsx")
    worksheet1 = workbook.add_worksheet()
    worksheet2 = workbook.add_worksheet()

    # Module-level accumulators filled by Weibo.main().
    # Post fields, one entry per collected post:
    id, name, text = [], [], []
    created_at, comments_count, attitudes_count = [], [], []
    # Comment fields, one entry per collected comment:
    uprofile_image_url, cid, uscreen_name, uprofile_url = [], [], [], []
    utext, ucreated_at, ulike_counts = [], [], []

    # Crawl page 70 of the timeline of user 3167104922.
    weibo = Weibo()
    weibo.main('3167104922', 70)

    # Column letter -> source list, in the order the cells are written.
    post_columns = (
        ("A", name),
        ("B", id),
        ("C", text),
        ("D", created_at),
        ("E", comments_count),
        ("F", attitudes_count),
    )
    comment_columns = (
        ("A", uprofile_image_url),
        ("B", cid),
        ("C", uscreen_name),
        ("D", uprofile_url),
        ("E", utext),
        ("F", ucreated_at),
        ("G", ulike_counts),
    )

    # One spreadsheet row per post; row numbers are one-based.
    for row_number in range(1, len(id) + 1):
        for letter, values in post_columns:
            worksheet1.write(letter + str(row_number), values[row_number - 1])

    # One spreadsheet row per comment.
    for row_number in range(1, len(cid) + 1):
        for letter, values in comment_columns:
            worksheet2.write(letter + str(row_number), values[row_number - 1])

    workbook.close()