Why isn't my Scrapy data being stored in the database?

Problem description

I want to create two collections: one storing Weibo user profile information, and one storing the Weibo posts those users publish, linked together through the user_id field.
The problem is the second collection, the one holding the posts the users repost: no data ever gets written to it, and I can't figure out why.
I'm using a local MongoDB instance for storage.
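To make the intended data model concrete: profiles and posts are meant to sit in separate collections and be joined through user_id. A minimal sketch of that lookup (the database and collection names are the ones used in the Weibo_Save pipeline further down, and the user id comes from start_urls):

import pymongo

# Look up one user's profile and all of their stored posts via the shared user_id.
db = pymongo.MongoClient()['spider']
profile = db['weibo_people'].find_one({'user_id': '6487174658'})
posts = list(db['sweet'].find({'user_id': '6487174658'}))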

What I've tried

  1. At first I thought settings wasn't configured properly; it has since been set up correctly.
  2. The argument every pipeline receives is item. Initially I had set up four classes in the pipeline module, one handling the user profile info and one handling the posts they repost, but I later realized that was wrong: each pipeline only ever gets the item argument, so I have to use isinstance to check which Item class the returned item belongs to and then store it in the matching MongoDB collection (the two item classes are sketched just below this list).
  3. Next I looked at the code that actually does the scraping, i.e. the spider, suspecting that the variable I was yielding was wrong, so I renamed items to sweet_items; it still doesn't work.
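For reference, items.py is not included in the excerpts below; judging from the fields assigned in the spider and pipeline code, the two item classes presumably look roughly like this (a reconstruction from the field names used in the code, not the actual file):

import scrapy


class WeiboItem(scrapy.Item):
    # Profile fields scraped from the /info page
    Nickname = scrapy.Field()
    Gender = scrapy.Field()
    Location = scrapy.Field()
    Birthday = scrapy.Field()
    Microblog_num = scrapy.Field()
    Focus_People = scrapy.Field()
    Fans = scrapy.Field()
    user_id = scrapy.Field()


class Weibo_Xinxi_Item(scrapy.Item):
    # Per-post fields, linked back to the profile through user_id
    Title = scrapy.Field()
    Comment_Num = scrapy.Field()
    Pass_Num = scrapy.Field()
    user_id = scrapy.Field()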

The settings file is already configured:

ITEM_PIPELINES = {
   'weibo.pipelines.WeiboPipeline': 300,# Weibo_Save
   'weibo.pipelines.Weibo_Save': 400,# Weibo_Sweet_Save
   # 'weibo.pipelines.Weibo_Sweet_Save': 500,# Weibo_Sweet_Save
   # 'weibo.pipelines.Weibo_Sweets_Save': 600,
}

Pipeline code

import pymongo
from .items import Weibo_Xinxi_Item, WeiboItem


class WeiboPipeline(object):
    def __init__(self):
        print("Pipeline opened")
        self.weibo = pymongo.MongoClient()['spider']['weibo_people']
        self.sweet = pymongo.MongoClient()['spider']['weibo_sweets']

    def open_spider(self, spider):  # The docs say this is called automatically when the spider starts.
        print('UniquePipeline opened')

    def process_item(self, item, spider):
        # Pass the item on unless a document with the same user_id already exists.
        if isinstance(item, WeiboItem):
            if self.weibo.find_one({'user_id': item['user_id']}):
                return None
            else:
                return item
        else:
            if self.sweet.find_one({'user_id': item['user_id']}):
                return None
            else:
                return item


class Weibo_Save(object):
    def __init__(self):
        self.client = pymongo.MongoClient()['spider']
        self.col = self.client['weibo_people']
        self.sweet = self.client['sweet']

    def open_spider(self, spider):  # The docs say this is called automatically when the spider starts.
        print('SaveItemPipeline opened')

    def process_item(self, item, spider):
        # print(type(item))
        if isinstance(item, WeiboItem):  # Decide which collection this item should be written to.
            try:
                item = dict(item)
                # print(type(item))
                self.col.insert_one(item)
                print("Stored a profile item")
                return item
            except:
                print("Duplicate data")
        else:
            try:
                item = dict(item)
                # print(type(item))
                self.sweet.save(item)
                print("Stored a post item")
                return item
            except:
                print("Duplicate data")

Spider code

import re

import scrapy

from weibo.items import WeiboItem, Weibo_Xinxi_Item


class WeiboSpiderSpider(scrapy.Spider):
    name = 'weibo_spider'
    start_urls = ['https://weibo.cn/6487174658/info']

    def parse(self, response):
        # Each profile field needs its own existence check here as well.
        text_end=';'.join(response.xpath("//div[@class='c']//text()").extract())
        item = WeiboItem()

        Nickname = re.findall('昵称:(.*?);', text_end)
        if Nickname:
            item['Nickname']=Nickname[0]

        Gender = re.findall('性别:(.*?);', text_end)
        if Gender:
            item['Gender'] = Gender[0]
        Location = re.findall('地区:(.*?);',text_end)

        if Location:
            item['Location'] = Location[0]
        Birthday = re.findall('生日:(.*?);',text_end)

        if Birthday:
            item['Birthday'] = Birthday[0]
        url = response.url.split('/')[3]
        item['user_id'] = url

        if url:
            yield scrapy.Request(url='https://weibo.cn/'+url,callback=self.parse_urls,meta={'item':item})  # Request the user's own timeline page

        if url:  # Keep crawling through fans.
            yield scrapy.Request(url='https://weibo.cn/{}/fans'.format(url),callback=self.parse_user_id)  # Request the fans page

    def parse_urls(self,response):
        item = response.meta.get('item')
        Microblog_num = re.findall(r'微博\[(\d+)\]',response.text)
        Focus_People = re.findall(r'关注\[(\d+)\]',response.text)
        Fans = re.findall(r'粉丝\[(\d+)\]',response.text)
        if Microblog_num:
            try:
                item['Microblog_num'] = Microblog_num[0]
            except:
                pass
        if Focus_People:
            try:
                item['Focus_People'] = Focus_People[0]
            except:
                pass
        if Fans:
            try:
                item['Fans'] = Fans[0]
            except:
                pass
        yield item
        # Scraping the post content
        alls = response.xpath("//div[@class='c']")[:-2]
        sweet_items = Weibo_Xinxi_Item
        comment_num = re.findall(r'class="cc">评论\[(\d+)\]<', response.text)
        pass_num = re.findall(r'class="cc">转发\[(\d+)\]<', response.text)
        for i,j,k in zip(alls,comment_num,pass_num):
            Title = ''.join(i.xpath("./div/span[@class='ctt']/text()").extract())
            if Title:
                sweet_items['Title'] = Title
            if j:
                sweet_items['Comment_Num'] = j
            if k:
                sweet_items['Pass_Num'] = k
            sweet_items['user_id'] = response.url.split('/')[3]
            yield sweet_items
            print('items already yielded' * 1000)
        try:
            next_url = response.xpath("//a[text()='下页']/@href").extract()[0]
            if next_url:
                yield scrapy.Request(url='https://weibo.cn'+next_url,callback=self.parse_sweet_book)
        except IndexError:
            pass

    def parse_sweet_book(self,response):
        sweet_items = Weibo_Xinxi_Item()
        alls = response.xpath("//div[@class='c']")[:-2]
        comment_num = re.findall(r'class="cc">评论\[(\d+)\]<', response.text)
        pass_num = re.findall(r'class="cc">转发\[(\d+)\]<', response.text)
        for i, j, k in zip(alls, comment_num, pass_num):
            Title = ''.join(i.xpath("./div/span[@class='ctt']/text()").extract())
            if Title:
                sweet_items['Title'] = Title
            if j:
                sweet_items['Comment_Num'] = j
            if k:
                sweet_items['Pass_Num'] = k
            sweet_items['user_id'] = response.url.split('/')[3]
            yield sweet_items
            print(sweet_items)

        next_url = response.xpath("//a[text()='下页']/@href").extract()[0]
        try:
            if next_url:
                yield scrapy.Request(url='https://weibo.cn' + next_url, callback=self.parse_sweet_book)
        except:
            pass

    def parse_user_id(self,response):
        uid_list = re.findall(r'https://weibo.cn/u/(\d+)',response.text,re.S)
        for uid in uid_list:
            yield scrapy.Request(url='https://weibo.cn/{}/info'.format(uid))
        # Pagination
        next_url = response.xpath("//a[text()='下页']/@href").extract()
        if next_url:
            yield scrapy.Request(url='https://weibo.cn'+next_url[0],callback=self.parse_user_id)
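A quick way to check what actually lands in MongoDB after a crawl (a sketch; the collection names are the ones used in Weibo_Save, and count_documents assumes pymongo 3.7 or newer):

import pymongo

db = pymongo.MongoClient()['spider']
print(db['weibo_people'].count_documents({}))  # profile collection: documents do show up here
print(db['sweet'].count_documents({}))         # post collection: stays empty, which is the problem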
