pyspider + MongoDB: basic usage
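
Two versions of the same ivsky.com image crawler follow. The first walks through the pyspider handler itself and shows where a custom writer object plugs into on_result, with the database part left as a placeholder; the second fills that part in and stores each result in MongoDB through pymongo.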

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-03-30 09:20:41
# Project: IvskyDemo

from pyspider.libs.base_handler import *

class MysqlWirter(object):
    def __init__(self):
        # Placeholder: create the database connection object here
        self.db = None

    def insert_result(self, result):
        # Write the result into the database
        pass

    def __del__(self):
        # Close the database connection
        # self.db.close()
        pass

class Handler(BaseHandler):

    # Prepare the request headers
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Host':'www.ivsky.com'
    }

    crawl_config = {
    }
    mysql = MysqlWirter()

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(
            'http://www.ivsky.com/tupian/',
            callback=self.big_categray,
            headers=self.headers
        )

    @config(age=10 * 24 * 60 * 60)
    def big_categray(self, response):
        # Loop over every matched tag
        for each in response.doc('.tpmenu>li>a').items():
            # Create a new crawl task
            self.crawl(
                # Use the tag's href attribute value as the URL of the new task
                each.attr.href,
                callback=self.small_categray,
                # save passes data on to the next callback, similar to meta in Scrapy
                save={'big_cate': each.text()}
            )


    def small_categray(self, response):
        '''
        Parse the sub-categories
        :param response:
        :return:
        '''
        # Find the sub-categories
        for each in response.doc('.sline>div>a').items():
            # Build a fresh save dict for each task so later iterations do not
            # overwrite the value already passed to earlier tasks
            save = dict(response.save, small_cate=each.text())
            # Create a new crawl task
            self.crawl(
                each.attr.href,
                callback=self.list_page,
                save=save,
                headers=self.headers
            )
    def list_page(self, response):
        '''
        Parse a list page
        :param response:
        :return:
        '''
        # Pull the save data out of the response
        save = response.save
        # Find the detail-page links of all images on the current page
        for each in response.doc('.pli>li>div>a').items():
            # Create a new crawl task
            self.crawl(
                each.attr.href,
                callback=self.detail_page,
                save=save,
                headers=self.headers
            )

        # Pagination
        for each in response.doc('.page-next').items():
            # Create a new crawl task whose callback is this method itself
            self.crawl(
                each.attr.href,
                callback=self.list_page,
                save=save,
                headers=self.headers
            )

    def detail_page(self, response):
        '''
        Parse the image URL
        :param response:
        :return:
        '''
        # Select the tag that has the given attribute value: tag[attr="value"]
        for each in response.doc('img[id="imgis"]').items():
            # Return the data to be saved as a dict
            return {
                'url': each.attr.src,
                'title': each.attr.src.split('/')[-1],
                'big_cate': response.save['big_cate'],
                'small_cate': response.save['small_cate']
            }
    # To store the data yourself, override on_result(self, result)
    def on_result(self, result):
        # Put the code that saves the data here
        self.mysql.insert_result(result)
        # Keep the parent class's default result handling
        super(Handler, self).on_result(result)
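
The MysqlWirter above is left as a stub: insert_result does nothing yet. A minimal sketch of what it could look like, assuming pymysql and a hypothetical local table imgs(url, title, big_cate, small_cate); neither the connection settings nor that table come from the original project:

import pymysql

class MysqlWriterSketch(object):
    def __init__(self):
        # Hypothetical connection settings; adjust to your own MySQL instance
        self.db = pymysql.connect(host='localhost', user='root', password='root',
                                  database='imgs', charset='utf8mb4')

    def insert_result(self, result):
        if not result:
            return
        sql = ('INSERT INTO imgs (url, title, big_cate, small_cate) '
               'VALUES (%s, %s, %s, %s)')
        with self.db.cursor() as cursor:
            cursor.execute(sql, (result['url'], result['title'],
                                 result['big_cate'], result['small_cate']))
        self.db.commit()

    def __del__(self):
        self.db.close()

The second listing below keeps the same crawler but swaps this placeholder for a working MongoDB writer.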






#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-03-30 09:20:41
# Project: IvskyDemo

from pyspider.libs.base_handler import *
from pymongo import MongoClient
# Class that writes results into MongoDB
class MongoWriter(object):

    def __init__(self):
        # Connect to the local MongoDB instance
        self.client = MongoClient()
        # Get the database
        db = self.client.imgs
        # Get the collection (table)
        self.imgs = db.imgs

    def insert_result(self, result):
        if result:
            # Insert the document into MongoDB
            self.imgs.insert_one(result)

    def __del__(self):
        # Close the database connection
        self.client.close()


class Handler(BaseHandler):
    # Create the writer object
    mongo = MongoWriter()

    # Prepare the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Host': 'www.ivsky.com',
        'Cookie': 'Hm_lvt_862071acf8e9faf43a13fd4ea795ff8c=1520298270,1520557723,1521015467,1522372414; BDTUJIAID=d4090b1fdf20d8f75ec2d25014d87217; Hm_lpvt_862071acf8e9faf43a13fd4ea795ff8c=1522372461; statistics_clientid=me',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'
    }

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        self.crawl(
            'http://www.ivsky.com/tupian/',
            callback=self.big_categray,
            headers=self.headers
        )

    @config(age=10 * 24 * 60 * 60)
    def big_categray(self, response):
        # Loop over every matched tag
        for each in response.doc('.tpmenu>li>a').items():
            # Create a new crawl task
            self.crawl(
                # Use the tag's href attribute value as the URL of the new task
                each.attr.href,
                callback=self.small_categray,
                # save passes data on to the next callback, similar to meta in Scrapy
                save={'big_cate': each.text()}
            )

    def small_categray(self, response):
        '''
        Parse the sub-categories
        :param response:
        :return:
        '''
        # Find the sub-categories
        for each in response.doc('.sline>div>a').items():
            # Build a fresh save dict for each task so later iterations do not
            # overwrite the value already passed to earlier tasks
            save = dict(response.save, small_cate=each.text())
            # Create a new crawl task
            self.crawl(
                each.attr.href,
                callback=self.list_page,
                save=save,
                headers=self.headers
            )

    def list_page(self, response):
        '''
        Parse a list page
        :param response:
        :return:
        '''
        # Pull the save data out of the response
        save = response.save
        # Find the detail-page links of all images on the current page
        for each in response.doc('.pli>li>div>a').items():
            # Create a new crawl task
            self.crawl(
                each.attr.href,
                callback=self.detail_page,
                save=save,
                headers=self.headers
            )

        # Pagination: follow the next-page link via .page-next, as in the first listing
        for each in response.doc('.page-next').items():
            self.crawl(
                each.attr.href,
                callback=self.list_page,
                save=save,
                headers=self.headers
            )
    def detail_page(self, response):
        '''
        Parse the image URL
        :param response:
        :return:
        '''
        # Select the tag that has the given attribute value: tag[attr="value"]
        for each in response.doc('img[id="imgis"]').items():
            # Return the data to be saved as a dict
            return {
                'url': each.attr.src,
                'title': each.attr.src.split('/')[-1],
                'big_cate': response.save['big_cate'],
                'small_cate': response.save['small_cate']
            }
    def on_result(self, result):
        # Write the result into MongoDB
        self.mongo.insert_result(result)
        # Keep the parent class's default result handling
        super(Handler, self).on_result(result)
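
Once the crawl has run, the stored documents can be inspected from a standalone script. A quick sketch, assuming the same default local MongoDB connection, database imgs and collection imgs used by MongoWriter above:

from pymongo import MongoClient

client = MongoClient()        # default localhost:27017, as in MongoWriter
imgs = client.imgs.imgs       # database "imgs", collection "imgs"

print(imgs.count_documents({}))           # how many images were recorded
for doc in imgs.find().limit(5):          # peek at a few stored documents
    print(doc['big_cate'], doc['small_cate'], doc['url'])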
