I'm trying to create two collections: one that stores Weibo user profiles and one that stores the posts those users publish, linked together through the user_id field.
The problem is the second collection, the one holding the users' posts (reposts), never receives any data, and I can't work out why.
Everything is stored in a local MongoDB instance.
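For reference, items.py is not included in the question; based on the fields used in the spider below, the two item classes presumably look roughly like this (the exact definitions are my assumption):

# items.py (assumed layout, reconstructed from the fields used below)
import scrapy

class WeiboItem(scrapy.Item):
    # user profile, one document per user
    user_id = scrapy.Field()
    Nickname = scrapy.Field()
    Gender = scrapy.Field()
    Location = scrapy.Field()
    Birthday = scrapy.Field()
    Microblog_num = scrapy.Field()
    Focus_People = scrapy.Field()
    Fans = scrapy.Field()

class Weibo_Xinxi_Item(scrapy.Item):
    # individual posts, linked back to the user via user_id
    user_id = scrapy.Field()
    Title = scrapy.Field()
    Comment_Num = scrapy.Field()
    Pass_Num = scrapy.Field()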
# settings.py
ITEM_PIPELINES = {
    'weibo.pipelines.WeiboPipeline': 300,     # duplicate filter
    'weibo.pipelines.Weibo_Save': 400,        # save pipeline
    # 'weibo.pipelines.Weibo_Sweet_Save': 500,
    # 'weibo.pipelines.Weibo_Sweets_Save': 600,
}
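An item passes through the enabled pipelines in ascending order of these numbers, so it goes through WeiboPipeline (300) first, and whatever WeiboPipeline's process_item returns is then handed to Weibo_Save (400).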
# pipelines.py
import pymongo
from .items import Weibo_Xinxi_Item, WeiboItem


class WeiboPipeline(object):
    def __init__(self):
        print("Pipeline opened")
        self.weibo = pymongo.MongoClient()['spider']['weibo_people']
        self.sweet = pymongo.MongoClient()['spider']['weibo_sweets']

    def open_spider(self, spider):  # according to the docs, this is called automatically when the spider starts
        print('UniquePipeline opened')

    def process_item(self, item, spider):
        # drop items whose user_id is already in the corresponding collection
        if isinstance(item, WeiboItem):
            if self.weibo.find_one({'user_id': item['user_id']}):
                return None
            else:
                return item
        else:
            if self.sweet.find_one({'user_id': item['user_id']}):
                return None
            else:
                return item
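# Side note (not part of the original code): the Scrapy docs expect process_item to
# either return an item or raise DropItem; returning None means the next pipeline
# receives None instead of the item. A conventional sketch of the same duplicate
# filter using DropItem might look like this (names are assumptions):
from scrapy.exceptions import DropItem

class DuplicateFilterSketch(object):
    """Hypothetical rewrite of WeiboPipeline; not the questioner's code."""
    def __init__(self):
        db = pymongo.MongoClient()['spider']
        self.weibo = db['weibo_people']
        self.sweet = db['weibo_sweets']

    def process_item(self, item, spider):
        collection = self.weibo if isinstance(item, WeiboItem) else self.sweet
        if collection.find_one({'user_id': item['user_id']}):
            raise DropItem('duplicate user_id: {}'.format(item['user_id']))
        return item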
class Weibo_Save(object):
    def __init__(self):
        self.client = pymongo.MongoClient()['spider']
        self.col = self.client['weibo_people']
        self.sweet = self.client['sweet']

    def open_spider(self, spider):  # according to the docs, this is called automatically when the spider starts
        print('SaveItemPipeline opened')

    def process_item(self, item, spider):
        # print(type(item))
        if isinstance(item, WeiboItem):  # use the item type to decide which collection it belongs in
            try:
                item = dict(item)
                # print(type(item))
                self.col.insert_one(item)
                print("Stored a user profile")
                return item
            except:
                print("Duplicate data")
        else:
            try:
                item = dict(item)
                # print(type(item))
                self.sweet.save(item)
                print("Stored a post")
                return item
            except:
                print("Duplicate data")
# weibo_spider.py
import re
import scrapy
from weibo.items import Weibo_Xinxi_Item, WeiboItem


class WeiboSpiderSpider(scrapy.Spider):
    name = 'weibo_spider'
    start_urls = ['https://weibo.cn/6487174658/info']

    def parse(self, response):
        # each profile field has to be checked before assignment
        text_end = ';'.join(response.xpath("//div[@class='c']//text()").extract())
        item = WeiboItem()
        Nickname = re.findall('昵称:(.*?);', text_end)
        if Nickname:
            item['Nickname'] = Nickname[0]
        Gender = re.findall('性别:(.*?);', text_end)
        if Gender:
            item['Gender'] = Gender[0]
        Location = re.findall('地区:(.*?);', text_end)
        if Location:
            item['Location'] = Location[0]
        Birthday = re.findall('生日:(.*?);', text_end)
        if Birthday:
            item['Birthday'] = Birthday[0]
        url = response.url.split('/')[3]
        item['user_id'] = url
        if url:
            # request the user's own home page
            yield scrapy.Request(url='https://weibo.cn/' + url, callback=self.parse_urls, meta={'item': item})
        if url:
            # keep crawling outwards through the fan lists
            yield scrapy.Request(url='https://weibo.cn/{}/fans'.format(url), callback=self.parse_user_id)
    def parse_urls(self, response):
        item = response.meta.get('item')
        Microblog_num = re.findall(r'微博\[(\d+)\]', response.text)
        Focus_People = re.findall(r'关注\[(\d+)\]', response.text)
        Fans = re.findall(r'粉丝\[(\d+)\]', response.text)
        if Microblog_num:
            try:
                item['Microblog_num'] = Microblog_num[0]
            except:
                pass
        if Focus_People:
            try:
                item['Focus_People'] = Focus_People[0]
            except:
                pass
        if Fans:
            try:
                item['Fans'] = Fans[0]
            except:
                pass
        yield item

        # scrape the posts listed on this page
        alls = response.xpath("//div[@class='c']")[:-2]
        sweet_items = Weibo_Xinxi_Item  # note: this is the item class itself, not an instance (compare parse_sweet_book below)
        comment_num = re.findall(r'class="cc">评论\[(\d+)\]<', response.text)
        pass_num = re.findall(r'class="cc">转发\[(\d+)\]<', response.text)
        for i, j, k in zip(alls, comment_num, pass_num):
            Title = ''.join(i.xpath("./div/span[@class='ctt']/text()").extract())
            if Title:
                sweet_items['Title'] = Title
            if j:
                sweet_items['Comment_Num'] = j
            if k:
                sweet_items['Pass_Num'] = k
            sweet_items['user_id'] = response.url.split('/')[3]
            yield sweet_items
            print('items already yielded' * 1000)  # debug marker

        # pagination for the post list; extract()[0] raises IndexError on the last page
        try:
            next_url = response.xpath("//a[text()='下页']/@href").extract()[0]
            if next_url:
                yield scrapy.Request(url='https://weibo.cn' + next_url, callback=self.parse_sweet_book)
        except IndexError:
            pass
    def parse_sweet_book(self, response):
        sweet_items = Weibo_Xinxi_Item()
        alls = response.xpath("//div[@class='c']")[:-2]
        comment_num = re.findall(r'class="cc">评论\[(\d+)\]<', response.text)
        pass_num = re.findall(r'class="cc">转发\[(\d+)\]<', response.text)
        for i, j, k in zip(alls, comment_num, pass_num):
            Title = ''.join(i.xpath("./div/span[@class='ctt']/text()").extract())
            if Title:
                sweet_items['Title'] = Title
            if j:
                sweet_items['Comment_Num'] = j
            if k:
                sweet_items['Pass_Num'] = k
            sweet_items['user_id'] = response.url.split('/')[3]
            yield sweet_items
            print(sweet_items)

        # pagination; extract()[0] raises IndexError on the last page
        try:
            next_url = response.xpath("//a[text()='下页']/@href").extract()[0]
            if next_url:
                yield scrapy.Request(url='https://weibo.cn' + next_url, callback=self.parse_sweet_book)
        except IndexError:
            pass
    def parse_user_id(self, response):
        uid_list = re.findall(r'https://weibo.cn/u/(\d+)', response.text, re.S)
        for uid in uid_list:
            yield scrapy.Request(url='https://weibo.cn/{}/info'.format(uid))
        # pagination through the fan list
        next_url = response.xpath("//a[text()='下页']/@href").extract()
        if next_url:
            yield scrapy.Request(url='https://weibo.cn' + next_url[0], callback=self.parse_user_id)
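One more thing worth checking in parse_urls: sweet_items is assigned the Weibo_Xinxi_Item class itself (no parentheses), so the field assignments in that loop are not made on an item instance at all, and in parse_sweet_book a single instance is created once outside the loop and then mutated and re-yielded on every iteration. A sketch of how that loop might look with a fresh instance per post (illustration only, not a tested fix):

for i, j, k in zip(alls, comment_num, pass_num):
    sweet_items = Weibo_Xinxi_Item()   # fresh item instance for every post (assumption: one document per post is intended)
    Title = ''.join(i.xpath("./div/span[@class='ctt']/text()").extract())
    if Title:
        sweet_items['Title'] = Title
    if j:
        sweet_items['Comment_Num'] = j
    if k:
        sweet_items['Pass_Num'] = k
    sweet_items['user_id'] = response.url.split('/')[3]
    yield sweet_items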