After spending some time learning about crawlers, I have found Scrapy to be a genuinely useful framework. Let's walk through a simple example to see what it can do.
This example crawls the basic profile information of specified Sina Weibo users, along with their fans and followees, and then keeps expanding the crawl through each user's relations. Put simply, with enough time and enough IPs, you can reach the basic profile of essentially every Weibo user starting from a single account.
Creating the project
scrapy startproject weibospider
The directory structure looks like this:
weibospider
  - weibospider
    - spiders          # spider directory; spider modules are created here
      - __init__.py
      - weibo.py       # the crawling logic lives here
    - __init__.py
    - items.py         # item templates for structured data, similar to models in Django
    - middlewares.py   # downloader middlewares, e.g. rotating request headers and an IP pool
    - pipelines.py     # persistence of the structured data
    - settings.py      # settings for running the crawler
    - main.py          # convenience script for launching the spider
  - scrapy.cfg         # project configuration; it mainly gives the Scrapy command-line tool a base config, the real settings live in settings.py
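main.py is not generated by scrapy startproject; it is just a small convenience script added so the spider can be launched from an IDE instead of the shell. A minimal sketch, assuming the spider is named 'weibo' as defined below:

# main.py -- convenience launcher; assumes the spider's name attribute is 'weibo'
from scrapy.cmdline import execute

if __name__ == '__main__':
    # equivalent to running "scrapy crawl weibo" from the project root
    execute(['scrapy', 'crawl', 'weibo'])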
Crawling user information
import scrapy
from scrapy import Request
import json

from weibospider.items import UserItem, UserRelationItem


class WeiBoUserInfoSpider(scrapy.Spider):
    name = 'weibo'
    user_urls = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    followers_urls = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fans_urls = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={since_id}'
    # seed uids; the crawl expands outwards from these users
    uid_lists = [
        '1767840980',
        '1582188504',
        '1759006061',
        '3908615569'
    ]

    def start_requests(self):
        for uid in self.uid_lists:
            yield Request(self.user_urls.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        res = json.loads(response.text)
        if res['ok']:
            user_item = UserItem()
            user_info = res.get('data').get('userInfo')
            # fields copied straight from the API response into the item
            user_fields = [
                'id', 'screen_name', 'profile_image_url', 'profile_url',
                'verified_reason', 'close_blue_v', 'description', 'gender',
                'follow_me', 'following', 'followers_count', 'follow_count',
                'cover_image_phone', 'avatar_hd'
            ]
            for field in user_fields:
                user_item[field] = user_info.get(field)
            yield user_item
            # also crawl this user's followees and fans
            yield Request(self.followers_urls.format(uid=user_item.get('id'), page=1),
                          callback=self.parse_follower,
                          meta={'uid': user_item.get('id'), 'page': 1})
            yield Request(self.fans_urls.format(uid=user_item.get('id'), since_id=1),
                          callback=self.parse_fans,
                          meta={'uid': user_item.get('id'), 'since_id': 1})
Getting the list of users that this user follows
    def parse_follower(self, response):
        res = json.loads(response.text)
        if res['ok']:
            card_group = res['data']['cards'][-1]['card_group']
            follow_list = []
            for card_info in card_group:
                user_info = card_info['user']
                # schedule the followee's own profile, then remember the relation
                yield Request(self.user_urls.format(uid=user_info['id']), callback=self.parse_user)
                follow_list.append({'id': user_info['id'], 'name': user_info['screen_name']})
            uid = response.meta.get('uid')
            user_relation = UserRelationItem()
            user_relation['id'] = uid
            user_relation['fans'] = []
            user_relation['follower'] = follow_list
            yield user_relation
            # request the next page of followees
            page = int(response.meta.get('page')) + 1
            yield Request(self.followers_urls.format(uid=uid, page=page),
                          callback=self.parse_follower,
                          meta={'uid': uid, 'page': page})
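parse_user also schedules requests whose callback is parse_fans, which the snippet above does not show. It would mirror parse_follower, except that the fans endpoint is paginated with since_id instead of page. A minimal sketch, assuming the fans response uses the same card_group structure:

    # sketch of the missing parse_fans callback; assumes the fans endpoint
    # returns the same card_group structure as the followers endpoint
    def parse_fans(self, response):
        res = json.loads(response.text)
        if res['ok']:
            card_group = res['data']['cards'][-1]['card_group']
            fans_list = []
            for card_info in card_group:
                user_info = card_info['user']
                yield Request(self.user_urls.format(uid=user_info['id']), callback=self.parse_user)
                fans_list.append({'id': user_info['id'], 'name': user_info['screen_name']})
            uid = response.meta.get('uid')
            user_relation = UserRelationItem()
            user_relation['id'] = uid
            user_relation['fans'] = fans_list
            user_relation['follower'] = []
            yield user_relation
            # the fans endpoint pages with since_id rather than page
            since_id = int(response.meta.get('since_id')) + 1
            yield Request(self.fans_urls.format(uid=uid, since_id=since_id),
                          callback=self.parse_fans,
                          meta={'uid': uid, 'since_id': since_id})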
items.py
import scrapy


class UserItem(scrapy.Item):
    # name of the MongoDB collection this item is written to
    collections = 'user_info'
    id = scrapy.Field()
    screen_name = scrapy.Field()
    profile_image_url = scrapy.Field()
    profile_url = scrapy.Field()
    verified_reason = scrapy.Field()
    close_blue_v = scrapy.Field()
    description = scrapy.Field()
    gender = scrapy.Field()
    follow_me = scrapy.Field()
    following = scrapy.Field()
    followers_count = scrapy.Field()
    follow_count = scrapy.Field()
    cover_image_phone = scrapy.Field()
    avatar_hd = scrapy.Field()
    create_time = scrapy.Field()


class UserRelationItem(scrapy.Item):
    collections = 'user'
    id = scrapy.Field()
    fans = scrapy.Field()
    follower = scrapy.Field()
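Note that collections is not a Scrapy concept; it is just a class attribute that the MongoDB pipeline below reads (UserItem.collections) to decide which collection to write into.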
pipelines.py: persisting the data
import pymongo
from datetime import datetime

from weibospider.items import UserItem, UserRelationItem


class UserCreateTimePipeline(object):
    # stamp each user item with the time it was crawled
    def process_item(self, item, spider):
        if isinstance(item, UserItem):
            item['create_time'] = datetime.now().strftime('%Y-%m-%d %H:%M')
        return item


class WeibospiderPipeline(object):
    def process_item(self, item, spider):
        return item


class WeiboPymongoPipeline(object):
    def __init__(self):
        conn = pymongo.MongoClient(host='127.0.0.1', port=27017)
        db = conn['day07_weibo']
        self.collection = db[UserItem.collections]

    def process_item(self, item, spider):
        if isinstance(item, UserItem):
            # upsert the user document, keyed by its weibo id
            self.collection.update_one({'id': item['id']}, {'$set': dict(item)}, upsert=True)
        if isinstance(item, UserRelationItem):
            # merge the relation lists into the same user document
            self.collection.update_one(
                {'id': item['id']},
                {'$addToSet': {
                    'fans': {'$each': item['fans']},
                    'follower': {'$each': item['follower']}
                }}
            )
        return item
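For these pipelines to run, they have to be enabled in settings.py. A sketch of the registration; the priority numbers here are arbitrary, but lower values run first, so the create-time pipeline is placed before the MongoDB one:

# in settings.py -- lower numbers run earlier in the pipeline chain
ITEM_PIPELINES = {
    'weibospider.pipelines.UserCreateTimePipeline': 300,
    'weibospider.pipelines.WeibospiderPipeline': 301,
    'weibospider.pipelines.WeiboPymongoPipeline': 302,
}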
middlewares.py: rotating request headers and an IP pool
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class RandomUserAgent(UserAgentMiddleware):
    # pick a random User-Agent from USER_AGENT_LIST for every request
    def process_request(self, request, spider):
        user_agent = random.choice(spider.settings.getlist('USER_AGENT_LIST'))
        request.headers.setdefault(b'User-Agent', user_agent)


class RandomProxy(object):
    # route every request through a random proxy from the PROXY pool
    def process_request(self, request, spider):
        random_proxy = random.choice(spider.settings.getlist('PROXY'))
        request.meta['proxy'] = 'http://%s' % random_proxy
Finally, register the middlewares in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'weibospider.middlewares.RandomUserAgent': 543,
    'weibospider.middlewares.RandomProxy': 544,
}
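Both middlewares expect a USER_AGENT_LIST and a PROXY pool to be defined in settings.py as well. The entries below are placeholders, not working values:

# placeholder pools read by the middlewares above -- replace with real values
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]

PROXY = [
    '127.0.0.1:8888',  # hypothetical proxy host:port
    '127.0.0.1:8889',  # hypothetical proxy host:port
]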