爬取字段有:微博ID、昵称(blogName)、性别(sex)、地址(location)、认证信息(identification)、个性签名(personal_sign)、微博数(blog_Num)、关注数(focus_Num)、粉丝数(fans_Num)。
spiders文件夹下microID_Spider.py这样写:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from blogSpider.items import blogIDItem
class MicroidSpiderSpider(scrapy.Spider):
    """Search weibo.cn for users matching a keyword and yield one
    blogIDItem per user profile found.

    Flow: start_requests paginates the search results; write_BlogID pulls
    each user's profile link from the result tables; getBlogIDinfo parses
    the profile page into an item.
    """
    name = 'microID_Spider'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://weibo.cn/search']
    # Crawl at most this many search-result pages (default 50).
    max_page = 50
    # Raw Cookie header copied from a logged-in browser session.
    myCookie = 'xxxxxxx'
    # Search keyword; kept as a class attribute so it can be overridden
    # (e.g. `scrapy crawl microID_Spider -a keyword=...`) instead of being
    # buried inside start_requests. Default preserves original behavior.
    keyword = '罗志祥'
    # NOTE: the original header values carried stray leading spaces
    # (' keep-alive', ' weibo.cn', ...); they are stripped here.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def _parse_cookies(self):
        """Turn the raw Cookie header string into the dict Scrapy expects.

        Splits on ';' and then on the FIRST '=' only, so cookie values that
        themselves contain '=' survive intact. Unlike the original
        `split(';')[:-1]`, the last pair is not silently dropped.
        """
        cookies = {}
        for pair in self.myCookie.split(';'):
            pair = pair.strip()
            if '=' in pair:
                key, _, value = pair.partition('=')
                cookies[key] = value
        return cookies

    def start_requests(self):
        """Issue one GET request per search-result page for the keyword."""
        cookies = self._parse_cookies()
        for page in range(1, self.max_page + 1):
            url = '{url}/user/?keyword={blogID}&page={pageNum}'.format(
                url=self.start_urls[0], blogID=self.keyword, pageNum=page)
            # Plain Request: the original FormRequest carried no formdata,
            # so it was already an ordinary GET.
            yield scrapy.Request(
                url,
                headers=self.headers,
                cookies=cookies,
                callback=self.write_BlogID,
            )

    def write_BlogID(self, response):
        """Schedule a profile-page request for every user on a result page.

        Each user entry on the search page lives in its own <table>; the
        first <a href> inside it points at the user's profile.
        """
        cookies = self._parse_cookies()
        for table in response.xpath('//table'):
            hrefs = table.css('a::attr(href)').extract()
            if not hrefs:
                # Guard: a table without any link would have raised
                # IndexError in the original code.
                continue
            url = 'https://weibo.cn' + hrefs[0].split('?')[0]
            yield scrapy.Request(
                url,
                headers=self.headers,
                cookies=cookies,
                callback=self.getBlogIDinfo,
            )

    @staticmethod
    def _split_sex_location(text):
        """Split a 'sex/location' fragment (e.g. '男/台湾 ') into its parts."""
        sex, _, location = text.partition('/')
        return sex, location.strip(' ')

    @staticmethod
    def _assign_extra(item, text):
        """Classify a trailing profile span: it is verification info iff it
        mentions '认证', otherwise it is the personal signature."""
        cleaned = text.replace(u'\u301c', u' ')
        if '认证' in text:
            item['identification'] = cleaned
        else:
            item['personal_sign'] = cleaned

    @staticmethod
    def _bracketed(text):
        """Return the substring between '[' and ']' — weibo.cn renders
        counts as e.g. '微博[123]'."""
        return text[text.index('[') + 1:text.index(']')]

    def getBlogIDinfo(self, response):
        """Parse a user's profile page into a blogIDItem and yield it."""
        item = blogIDItem()
        # The user ID is the last path segment of the profile URL.
        item['ID'] = response.url.rsplit('/', 1)[-1]
        spans = response.xpath(
            '//div[@class="ut"]/span[@class="ctt"]/text()').extract()
        # Defaults: both optional fields are empty unless a span fills them.
        item['identification'] = ''
        item['personal_sign'] = ''
        # Four observed layouts, keyed on how many ctt spans the page has.
        if len(spans) == 1:
            # Nickname and 'sex/location' share one span, joined by \xa0,
            # e.g. '羅誌祥\xa0男/台湾 \xa0 '.
            first = spans[0].split(u'\xa0')
            item['blogName'] = first[0]
            item['sex'], item['location'] = self._split_sex_location(first[1])
        elif len(spans) == 2:
            first = spans[0].split(u'\xa0')
            item['blogName'] = first[0]
            item['sex'], item['location'] = self._split_sex_location(first[1])
            self._assign_extra(item, spans[1])
        elif len(spans) == 3:
            item['blogName'] = spans[0].replace(u'\xa0', u' ')
            second = spans[1].split(u'\xa0')
            item['sex'], item['location'] = self._split_sex_location(second[1])
            self._assign_extra(item, spans[2])
        elif len(spans) == 4:
            item['blogName'] = spans[0].replace(u'\xa0', u' ')
            second = spans[1].split(u'\xa0')
            item['sex'], item['location'] = self._split_sex_location(second[1])
            item['identification'] = spans[2].replace(u'\u301c', u' ')
            item['personal_sign'] = spans[3].replace(u'\u301c', u' ')
        # Post / follow / fan counts, rendered like '微博[123]'.
        counts = response.xpath('//span[@class="tc"]/text()').extract()
        tip2 = response.xpath('//div[@class="tip2"]')
        focus = tip2.xpath('a[1]/text()').extract()
        fans = tip2.xpath('a[2]/text()').extract()
        item['blog_Num'] = self._bracketed(counts[0])
        item['focus_Num'] = self._bracketed(focus[0])
        item['fans_Num'] = self._bracketed(fans[0])
        yield item
items.py这样写:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class blogIDItem(scrapy.Item):
    '''
    Weibo user profile information scraped by microID_Spider.
    '''
    # Collection name used when the item is stored — presumably a MongoDB
    # collection consumed by a pipeline; confirm in pipelines.py.
    collection = 'blogID_Data'
    ID = scrapy.Field()              # user ID taken from the profile URL's last path segment
    blogName = scrapy.Field()        # display name (nickname)
    sex = scrapy.Field()             # gender, the part before '/' in the profile line
    location = scrapy.Field()        # region, the part after '/' in the profile line
    identification = scrapy.Field()  # verification info ('' when absent)
    personal_sign = scrapy.Field()   # personal signature ('' when absent)
    blog_Num = scrapy.Field()        # number of posts, parsed from '微博[n]'
    fans_Num = scrapy.Field()        # number of followers, parsed from 'a[2]' of div.tip2
    focus_Num = scrapy.Field()       # number of accounts followed, parsed from 'a[1]' of div.tip2