Scrapy: crawling Sina Weibo user profiles (writing the results to MongoDB)

The fields crawled are:

  1. Weibo user ID
  2. Weibo nickname
  3. Gender
  4. Location info
  5. Verification info
  6. Personal signature
  7. Number of posts
  8. Number of followers
  9. Number of accounts followed

microID_Spider.py under the spiders folder is written like this:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from blogSpider.items import blogIDItem

class MicroidSpiderSpider(scrapy.Spider):
    name = 'microID_Spider'
    allowed_domains = ['weibo.cn']
    start_urls = ['https://weibo.cn/search']
    # Crawl at most 50 search-result pages by default
    max_page = 50
    # Paste your own logged-in weibo.cn cookie string here ('key=value; key=value; ...')
    myCookie = 'xxxxxxx'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'weibo.cn',
        'Origin': 'https://weibo.cn',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }

    def start_requests(self):
        # Turn the raw cookie string into a dict that Scrapy can send with each request
        cookie = {}
        for pair in self.myCookie.split(';'):
            pair = pair.strip()
            if not pair:
                continue
            key, _, value = pair.partition('=')
            cookie[key] = value

        # Search keyword: the nickname to look up on the user-search page
        blogID = '罗志祥'
        for i in range(1, self.max_page + 1):
            url = '{url}/user/?keyword={blogID}&page={pageNum}'.format(url=self.start_urls[0], blogID=blogID, pageNum=i)
            # The query string is already in the URL, so a plain GET request is enough
            yield scrapy.Request(
                url,
                headers=self.headers,
                cookies=cookie,
                callback=self.write_BlogID,
            )

    def write_BlogID(self, response):
        pageInfo = Selector(response)
        # print(response.body.decode('utf-8'))
        # Every user hit on the search-result page sits in its own HTML <table>, so grab them all
        all_Table = pageInfo.xpath('//table')
        # Rebuild the cookie dict for the follow-up profile requests
        cookie = {}
        for pair in self.myCookie.split(';'):
            pair = pair.strip()
            if not pair:
                continue
            key, _, value = pair.partition('=')
            cookie[key] = value
        # print(all_Table)
        for table in all_Table:
            # The first link in each table points to the user's profile page
            links = table.css('a::attr(href)').extract()
            if not links:
                continue
            # Drop the query string and keep only the profile path
            url = 'https://weibo.cn' + links[0].split('?')[0]
            yield scrapy.Request(
                url,
                headers=self.headers,
                cookies=cookie,
                callback=self.getBlogIDinfo,
            )


    def getBlogIDinfo(self, response):
        # print(response.body.decode('utf-8'))
        # print(response.url)
        blogID_Info = blogIDItem()
        # The Weibo user ID is the last path segment of the profile URL
        blogID_Info['ID'] = response.url.split('/')[-1]
        pageInfo = Selector(response)
        ut_div = pageInfo.xpath('//div[@class="ut"]')
        spans = ut_div.xpath('span[@class="ctt"]/text()').extract()
        # print(len(spans))
        # print(spans)
        # The "ut" div holds 1 to 4 ctt spans, depending on which profile fields are filled in
        if len(spans) == 1:
            # Case 1: only nickname, gender and location, e.g. ['羅誌祥\xa0男/台湾    \xa0    '], so the list has length 1
            firstRowInfo = spans[0].split(u'\xa0')
            # Nickname
            blogID_Info['blogName'] = firstRowInfo[0].replace(u'\xa0', u' ')
            # Gender (text before the '/')
            blogID_Info['sex'] = firstRowInfo[1][:firstRowInfo[1].index('/')]
            # Location (text after the '/')
            blogID_Info['location'] = firstRowInfo[1][firstRowInfo[1].index('/')+1:].strip(' ')
            # No verification info
            blogID_Info['identification'] = ''
            # No personal signature
            blogID_Info['personal_sign'] = ''
        elif len(spans) == 2:
            # Case 2: first span as in case 1; the second span is either the verification info or the signature
            firstRowInfo = spans[0].split(u'\xa0')
            blogID_Info['blogName'] = firstRowInfo[0].replace(u'\xa0', u' ')
            blogID_Info['sex'] = firstRowInfo[1][:firstRowInfo[1].index('/')]
            blogID_Info['location'] = firstRowInfo[1][firstRowInfo[1].index('/') + 1:].strip(' ')
            if spans[1].find('认证') == -1:
                # No verification info; the second span is the personal signature
                blogID_Info['identification'] = ''
                blogID_Info['personal_sign'] = spans[1].replace(u'\u301c', u' ')
            else:
                # The second span is the verification info; no personal signature
                blogID_Info['identification'] = spans[1].replace(u'\u301c', u' ')
                blogID_Info['personal_sign'] = ''
        elif len(spans) == 3:
            # Case 3: nickname in its own span, gender/location in the second,
            # and the third span is either the verification info or the signature
            blogID_Info['blogName'] = spans[0].replace(u'\xa0', u' ')
            secondRowInfo = spans[1].split(u'\xa0')
            blogID_Info['sex'] = secondRowInfo[1][:secondRowInfo[1].index('/')]
            blogID_Info['location'] = secondRowInfo[1][secondRowInfo[1].index('/') + 1:].strip(' ')
            if spans[2].find('认证') == -1:
                # No verification info; the third span is the personal signature
                blogID_Info['identification'] = ''
                blogID_Info['personal_sign'] = spans[2].replace(u'\u301c', u' ')
            else:
                # The third span is the verification info; no personal signature
                blogID_Info['identification'] = spans[2].replace(u'\u301c', u' ')
                blogID_Info['personal_sign'] = ''
        elif len(spans) == 4:
            # Case 4: all four spans are present: nickname, gender/location, verification info, signature
            blogID_Info['blogName'] = spans[0].replace(u'\xa0', u' ')
            secondRowInfo = spans[1].split(u'\xa0')
            blogID_Info['sex'] = secondRowInfo[1][:secondRowInfo[1].index('/')]
            blogID_Info['location'] = secondRowInfo[1][secondRowInfo[1].index('/') + 1:].strip(' ')
            blogID_Info['identification'] = spans[2].replace(u'\u301c', u' ')
            blogID_Info['personal_sign'] = spans[3].replace(u'\u301c', u' ')
        # print(blogID_Info['blogName'])
        # Post / following / follower counts are rendered like '微博[n]', '关注[n]', '粉丝[n]'
        blogNumInfo = pageInfo.xpath('//span[@class="tc"]/text()').extract()
        # print(blogNumInfo)
        tip2 = pageInfo.xpath('//div[@class="tip2"]')
        focusInfo = tip2.xpath('a[1]/text()').extract()
        # print(focusInfo)
        fansInfo = tip2.xpath('a[2]/text()').extract()
        # print(fansInfo)
        # Keep only the number between the square brackets
        blogID_Info['blog_Num'] = blogNumInfo[0][blogNumInfo[0].index('[')+1:blogNumInfo[0].index(']')]
        blogID_Info['focus_Num'] = focusInfo[0][focusInfo[0].index('[')+1:focusInfo[0].index(']')]
        blogID_Info['fans_Num'] = fansInfo[0][fansInfo[0].index('[')+1:fansInfo[0].index(']')]
        # print(blogID_Info)
        yield blogID_Info

items.py is written like this:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy

class blogIDItem(scrapy.Item):
    '''
    Weibo user profile info
    '''
    # MongoDB collection that the storage pipeline writes items into
    collection = 'blogID_Data'
    ID = scrapy.Field()
    blogName = scrapy.Field()
    sex = scrapy.Field()
    location = scrapy.Field()
    identification = scrapy.Field()
    personal_sign = scrapy.Field()
    blog_Num = scrapy.Field()
    fans_Num = scrapy.Field()
    focus_Num = scrapy.Field()
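
The title says the scraped items end up in MongoDB, and the collection = 'blogID_Data' attribute on the item is meant as the target collection name, but the post stops at items.py. Below is only a minimal sketch of what such a storage pipeline could look like; the class name MongoPipeline, the MONGO_URI/MONGO_DB setting names and the 'weibo' database name are my own placeholders, not part of the original project:

# pipelines.py (sketch) -- requires pymongo
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection settings from settings.py, with local defaults
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'weibo'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # blogIDItem.collection ('blogID_Data') decides which collection the document goes into
        self.db[item.collection].insert_one(dict(item))
        return item

To actually use it, the pipeline has to be enabled in settings.py, for example:

# settings.py (excerpt, placeholder values)
ITEM_PIPELINES = {
    'blogSpider.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DB = 'weibo'

With that in place, running scrapy crawl microID_Spider from the project root should store one document per crawled user in the blogID_Data collection.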
