Throttling the crawl: a 0.5 s download delay with 32 concurrent requests
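The throttle above maps onto two standard Scrapy settings. Below is a minimal sketch of the settings.py entries, assuming the stock setting names; the project's actual settings file is not shown in the source.

# weibouser/settings.py (sketch -- assumed, not from the source)
DOWNLOAD_DELAY = 0.5        # wait 0.5 s between requests to the same domain
CONCURRENT_REQUESTS = 32    # allow up to 32 requests in flight globally

With a per-domain delay in place, the global concurrency cap mainly matters when requests fan out across hosts; here it keeps the scheduler from flooding the single API endpoint.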
# -*- coding: utf-8 -*-
import json

from scrapy import Spider, Request

from weibouser.items import WeibouserItem


class WeiboSpider(Spider):
    name = 'weibo2'
    allowed_domains = ['weibo.cn']
    id = "5702787827"  # seed user id: the crawl starts from this account's fan list

    # Issue the initial request.
    def start_requests(self):
        # Every fan-list page hits the same JSON API endpoint; only the user id
        # (and the since_id page cursor) in the URL changes, so the API can be
        # requested directly and still returns a response.
        yield Request(
            url="https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_"
                f"{self.id}&luicode=10000011&lfid=107603{self.id}"
                "&featurecode=20000320&since_id=1",
            callback=self.parse
        )
    # Parse the JSON returned by the API.
    def parse(self, response):
        result = json.loads(response.body.decode("utf-8"))
        try:
            result = result["data"]["cards"][0]["card_group"]
        except (KeyError, IndexError):
            print("No data in this response, leaving the current loop")
            return
        for data in result:
            # A fresh item per user, so yielded items don't share state.
            item = WeibouserItem()
            # For every field declared on the item that also appears in the
            # user dict, copy the value across -- a quick way to fill all fields.
            for field in item.fields:
                if field in data["user"]:
                    item[field] = data["user"][field]
                    print(item[field])
            yield item
            # Take this fan's id and recurse into parse to crawl their own fan list.
            if data["user"]['screen_name'] == '新手指南':
                # Sentinel account: stop recursing once it shows up.
                return
            num = int(data["user"]["followers_count"]) // 20  # the API serves 20 fans per page
            print('pages to request for this user:', num)
            if num > 0:  # only recurse for users with more than 20 fans
                if num > 250:  # beyond roughly 250 pages the API may stop returning data
                    num = 250
                for i in range(num):
                    yield Request(
                        url="https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_"
                            f"{item['id']}&luicode=10000011&lfid=107603{item['id']}"
                            f"&featurecode=20000320&since_id={i}",
                        callback=self.parse
                    )
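The spider imports WeibouserItem, but items.py is not shown in the source. Here is a minimal sketch of what it would need to declare; the three field names below are assumptions inferred from the keys the spider actually reads out of data["user"].

# weibouser/items.py (sketch -- assumed, not from the source)
import scrapy

class WeibouserItem(scrapy.Item):
    id = scrapy.Field()               # user id, reused to build the next fan-list URL
    screen_name = scrapy.Field()      # display name, compared against '新手指南'
    followers_count = scrapy.Field()  # fan count, determines how many pages to request

Because parse copies every declared field it finds in the user dict, declaring extra Field() entries is enough to capture additional profile keys without touching the spider itself.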