Crawling Sina Weibo user profiles, fans, and followees with Scrapy

After spending some time learning about web crawlers, I have found Scrapy to be a genuinely useful framework. Let's see what it can do with a simple example.
This example fetches the basic profile, fans, and followees of a few specified Sina Weibo users, then keeps crawling deeper through those users' own fans and followees. In short, given enough time and enough IPs, you can reach the basic info of essentially every Weibo user starting from a single account.

Creating the project

scrapy startproject weibospider
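
The spider file itself can be written by hand or generated with Scrapy's genspider command; the name and domain below are simply the ones used in this example:

scrapy genspider weibo m.weibo.cn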
The directory structure looks like this:
weibospider
    -weibospider
        -spiders # spider package; the spider modules live here
            -__init__.py
            -weibo.py   # the spider itself
        __init__.py
        items.py    # data models for structured items, similar to models in Django
        middlewares.py  # downloader middlewares, used here for random request headers and an IP pool
        pipelines.py   # persistence of the structured data
        settings.py  # project-wide crawler settings
    main.py   # convenience launcher (see the sketch below)
    scrapy.cfg   # project configuration; mainly gives the Scrapy command-line tool a base config, the real settings live in settings.py
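
main.py is just a convenience launcher so the spider can be started from an IDE instead of the command line. A minimal sketch using Scrapy's cmdline helper (the spider name weibo is defined in the next section):

# main.py
from scrapy import cmdline

# equivalent to running "scrapy crawl weibo" from the project root
cmdline.execute('scrapy crawl weibo'.split())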

Crawling user info

import scrapy
from scrapy import Request
import json
from weibospider.items import UserItem, UserRelationItem

class WeiBoUserInfoSpider(scrapy.Spider):
    # unique spider name; the crawl is started through this name
    name = 'weibo'
    # user profile url, worked out by inspecting the m.weibo.cn api
    user_urls = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    # url listing the users this user follows, from the same api
    followers_urls = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    # url listing this user's fans, from the same api
    fans_urls = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={since_id}'

    # seed uids to start the crawl from
    uid_lists = [
        '1767840980',
        '1582188504',
        '1759006061',
        '3908615569'
    ]

    # start_requests generates the initial requests, one per seed uid
    def start_requests(self):
        for uid in self.uid_lists:
            # each request is handled by the self.parse_user callback
            yield Request(self.user_urls.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        res = json.loads(response.text)
        # the api signals success through the "ok" field
        if res['ok']:
            # build an item from the parsed user info
            user_item = UserItem()
            user_info = res.get('data').get('userInfo')
            # fields copied straight from the api response into the item
            user_fields = [
                'id', 'screen_name', 'profile_image_url', 'profile_url',
                'verified_reason', 'close_blue_v', 'description', 'gender',
                'follow_me', 'following', 'followers_count', 'follow_count',
                'cover_image_phone', 'avatar_hd'
            ]
            for field in user_fields:
                user_item[field] = user_info.get(field)
            # yield the item so the pipelines can persist it
            yield user_item

            # followees: Request takes the url, the callback to run on the response,
            # and meta, a dict of values passed along and read back via response.meta
            yield Request(self.followers_urls.format(uid=user_item.get('id'), page=1),
                          callback=self.parse_follower,
                          meta={'uid': user_item.get('id'), 'page': 1})

            # fans
            yield Request(self.fans_urls.format(uid=user_item.get('id'), since_id=1),
                          callback=self.parse_fans,
                          meta={'uid': user_item.get('id'), 'since_id': 1})

Fetching the users this user follows

    def parse_follower(self, response):
        # parse the list of users this user follows
        res = json.loads(response.text)
        if res['ok']:
            card_group = res['data']['cards'][-1]['card_group']
            follow_list = []
            for card_info in card_group:
                user_info = card_info['user']
                uid = user_info['id']
                # crawl the full profile of every followee as well
                yield Request(self.user_urls.format(uid=uid), callback=self.parse_user)
                follow_list.append({'id': user_info['id'], 'name': user_info['screen_name']})

            # relate the followees back to the user we started from
            uid = response.meta.get('uid')
            user_relation = UserRelationItem()
            user_relation['id'] = uid
            user_relation['fans'] = []
            user_relation['follower'] = follow_list
            yield user_relation

            # request the next page of followees
            page = int(response.meta.get('page')) + 1
            yield Request(self.followers_urls.format(uid=uid, page=page),
                          callback=self.parse_follower,
                          meta={'uid': uid, 'page': page})
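
parse_fans is registered as a callback above but the original post never shows it. Below is a minimal sketch that mirrors parse_follower; it assumes since_id can simply be incremented like a page number, whereas the real API may instead return the next since_id in its response, so treat the pagination part as an assumption.

    def parse_fans(self, response):
        # parse this user's fan list; same structure as parse_follower
        res = json.loads(response.text)
        if res['ok']:
            card_group = res['data']['cards'][-1]['card_group']
            fan_list = []
            for card_info in card_group:
                user_info = card_info['user']
                # crawl the full profile of every fan as well
                yield Request(self.user_urls.format(uid=user_info['id']), callback=self.parse_user)
                fan_list.append({'id': user_info['id'], 'name': user_info['screen_name']})

            # relate the fans back to the user we started from
            uid = response.meta.get('uid')
            user_relation = UserRelationItem()
            user_relation['id'] = uid
            user_relation['fans'] = fan_list
            user_relation['follower'] = []
            yield user_relation

            # next page of fans; since_id is incremented here as an assumption
            since_id = int(response.meta.get('since_id')) + 1
            yield Request(self.fans_urls.format(uid=uid, since_id=since_id),
                          callback=self.parse_fans,
                          meta={'uid': uid, 'since_id': since_id})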

items.py

class UserItem(scrapy.Item):
    # name of the MongoDB collection this item is stored in (read by the pipeline)
    collections = 'user_info'
    id = scrapy.Field()
    screen_name = scrapy.Field()
    profile_image_url = scrapy.Field()
    profile_url = scrapy.Field()
    verified_reason = scrapy.Field()
    close_blue_v = scrapy.Field()
    description = scrapy.Field()
    gender = scrapy.Field()
    follow_me = scrapy.Field()
    following = scrapy.Field()
    followers_count = scrapy.Field()
    follow_count = scrapy.Field()
    cover_image_phone = scrapy.Field()
    avatar_hd = scrapy.Field()
    create_time = scrapy.Field()


class UserRelationItem(scrapy.Item):
    collections = 'user'
    id = scrapy.Field()
    fans = scrapy.Field()
    follower = scrapy.Field()

pipelines.py: persisting the data

import pymongo
from datetime import datetime
from weibospider.items import UserItem, UserRelationItem

# stamp every UserItem with a create_time before it is stored
class UserCreateTimePipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, UserItem):
            item['create_time'] = datetime.now().strftime('%Y-%m-%d %H:%M')
        return item

class WeibospiderPipeline(object):
    def process_item(self, item, spider):
        return item

class WeiboPymongoPipeline(object):
    # persists the items into MongoDB
    def __init__(self):
        conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        db = conn['day07_weibo']
        self.collection = db[UserItem.collections]

    def process_item(self, item, spider):
        # user profiles are upserted by id
        if isinstance(item, UserItem):
            self.collection.update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
        # relation items append the fans / followees to that user's document
        if isinstance(item, UserRelationItem):
            self.collection.update_one(
                {'id': item['id']},
                {'$addToSet': {
                    'fans': {'$each': item['fans']},
                    'follower': {'$each': item['follower']}
                }}
            )
        return item
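
These pipelines only run if they are registered in settings.py. A possible configuration is shown below; the numbers just set the order, lower values run first, so the create_time stamp is applied before the MongoDB write:

ITEM_PIPELINES = {
   'weibospider.pipelines.UserCreateTimePipeline': 300,
   'weibospider.pipelines.WeiboPymongoPipeline': 301,
}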

middlewares.py: random request headers and an IP pool

import random
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class RandomUserAgent(UserAgentMiddleware):
    def process_request(self, request, spider):
        # pick a random User-Agent from USER_AGENT_LIST in settings.py
        user_agent = random.choice(spider.settings.get('USER_AGENT_LIST'))
        # assign directly (rather than setdefault) so the random value actually takes effect
        request.headers['User-Agent'] = user_agent

class RandomProxy(object):
    def process_request(self, request, spider):
        # route the request through a random proxy from the PROXY pool in settings.py
        random_proxy = random.choice(spider.settings.get('PROXY'))
        request.meta['proxy'] = 'http://%s' % random_proxy

In settings.py, register the two middlewares:
DOWNLOADER_MIDDLEWARES = {
   # 'weibospider.middlewares.WeibospiderDownloaderMiddleware': 543,
   'weibospider.middlewares.RandomUserAgent': 543,
   'weibospider.middlewares.RandomProxy': 544,
}
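
Both middlewares read their values from settings.py, so USER_AGENT_LIST and PROXY have to be defined there as well. The entries below are only placeholders; swap in real User-Agent strings and working proxies of your own:

USER_AGENT_LIST = [
   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]
PROXY = [
   '127.0.0.1:8888',
   '127.0.0.1:8889',
]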
