Scrapy Introduction and Example

  • Introduction
  • Using the Framework
    • Creating a Project
    • Creating a Spider
    • Example
    • Running the Project

Introduction

Scrapy is an application framework for crawling websites and extracting structured data, and it has a wide range of uses: it is commonly applied in programs for data mining, information processing, and archiving historical data. With the Scrapy framework we can easily implement a crawler that scrapes the content or images of a specified site. [Baidu Baike]

Using the Framework

Creating a Project

scrapy startproject <project_name>
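
For reference, creating a project named aihuishou (the project name assumed here to match the imports in the example code below) produces roughly the following layout; details vary slightly between Scrapy versions:

scrapy startproject aihuishou

aihuishou/
    scrapy.cfg            # deploy configuration
    aihuishou/            # the project's Python module
        __init__.py
        items.py          # item definitions
        middlewares.py    # spider/downloader middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders go here
            __init__.py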

Creating a Spider

Because Scrapy is a crawler framework, it is usually used for projects that bundle several spiders together, so the next step is to create the spiders themselves:

cd <project_name>
scrapy genspider <spider_name> <domain>
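
For this article's example the spider could have been generated like this (huishou and aihuishou.com match the code below); genspider then writes a skeleton similar to the following, which the Spider section fills in. The exact template varies by Scrapy version:

cd aihuishou
scrapy genspider huishou aihuishou.com

# aihuishou/spiders/huishou.py (generated skeleton)
import scrapy


class HuishouSpider(scrapy.Spider):
    name = 'huishou'
    allowed_domains = ['aihuishou.com']
    start_urls = ['https://www.aihuishou.com/']

    def parse(self, response):
        pass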

Example

A spider I wrote for someone a while back that scrapes recycling data.
Scrapy is non-blocking throughout: after the spider yields an item, the pipeline takes over. The spider file holds only the page-parsing logic; the real work (writing files, downloading images) happens in the pipeline.


Spider

# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from aihuishou.items import AihuishouItem,DownloadImg
from scrapy.http import Request


class HuishouSpider(scrapy.Spider):
    name = 'huishou'
    allowed_domains = ['aihuishou.com']
    start_urls = ['https://www.aihuishou.com/']

    def parse(self, response):
        # Top-level category panels on the home page
        hxs = Selector(response=response).xpath('//div[@class="category-panel"]')
        for obj in hxs:
            href = obj.xpath('.//a[@class="morebrands"]/@href').extract_first().strip()
            href = href.replace("/", "")

            # Follow only the categories we care about, carrying the category
            # name in meta as a flag so later callbacks and the pipeline know
            # where the data belongs
            if href in ('shouji', 'pingban', 'laptop', 'sheying', 'digital'):
                yield Request(url="https://www.aihuishou.com/%s" % href,
                              meta={'flag': href}, callback=self.c_item)

    def c_item(self, response):
        # Brand list on a category page: follow every brand link, and record
        # each brand's name plus its logo image
        hxs = Selector(response=response).xpath('//div[@class="main-right"]/ul/li')
        for obj in hxs:
            href = obj.xpath('.//a/@href').extract_first().strip()
            yield Request(url="https://www.aihuishou.com/%s" % href, meta=response.meta, callback=self.c_item)
            title = obj.xpath('.//p/text()').extract_first().rstrip()
            img = obj.xpath('.//img/@src').extract_first().strip()
            lst = {response.meta['flag']: title + '-----' + img.split('/')[-1]}
            item_obj = AihuishouItem(_class=lst)  # write the category entry
            yield item_obj
            downimg = DownloadImg(imgurl=img, imgpath='class/images/')  # download the category image
            yield downimg
            print(href, title, img)

        # Product listing page: if a classification filter is active,
        # follow the "next page" link and every product detail link
        classification = Selector(response=response).xpath('//div[@class="list-box-wrapper"]//dd[@class="active"]/a/text()').extract()

        if classification:
            page = Selector(response=response).xpath('//div[@class="product-list-pager"]'
                                                     '/a[@class="next no_hover"]/@href').extract_first()
            item = Selector(response=response).xpath('//div[@class="product-list-wrapper"]/ul/li')

            if page:
                yield Request(url="https://www.aihuishou.com/%s" % page, meta=response.meta, callback=self.c_item)
            for obj in item:
                item_href = obj.xpath('.//a/@href').extract_first()
                yield Request(url="https://www.aihuishou.com/%s" % item_href, meta=response.meta, callback=self.c_item)

        # Product detail page: extract brand, name, price, image and the
        # configurable properties, then emit one item per product
        item_info = Selector(response=response).xpath('//div[@id="group-property"]/div[@class="left"]')
        item_info2 = Selector(response=response).xpath('//div[@id="group-property"]/div[@class="right"]')
        if item_info and item_info2:
            category = Selector(response=response).xpath('//input[@id="product_brand"]/@value').extract_first().rstrip()
            item_name = item_info.xpath('.//h1/text()').extract_first().rstrip()
            img = item_info.xpath('.//img/@src').extract_first().strip()
            imgurl = (img.split('?')[0]).split('/')[-1]
            price = item_info.xpath('.//ul[@class="clearfix section-price"]/@data-highest-price').extract_first().strip()
            price = price.split('?')[0]
            item_infos = item_info2.xpath('.//ul')
            item_multi = list()
            item_single = dict()
            for obj in item_infos:
                item_choice = obj.xpath('./@data-ppn-name').extract_first()
                if item_choice:
                    # single-choice property group (e.g. storage size)
                    item_single_all = list()
                    item_single_alls = obj.xpath('.//div/text()').extract()
                    for y in item_single_alls:
                        item_single_all.append(y)
                    item_single[item_choice] = item_single_all
                else:
                    # multi-choice properties
                    item_multi_alls = obj.xpath('.//span[@class="property_value"]/text()').extract()
                    for y in item_multi_alls:
                        item_multi.append(y)
            lst = {response.meta['flag']: category + '-----' + item_name + '-----' + price + '-----' + imgurl + '-----' + str(item_single) + '-----' + str(item_multi)}
            item_obj = AihuishouItem(_item=lst)  # write the item entry
            yield item_obj
            downimg = DownloadImg(imgurl=img, imgpath='item/images/')  # download the item image
            yield downimg
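
A side note on the selector API: newer Scrapy versions make the explicit Selector(response=response) construction unnecessary. response.xpath() works directly, .get()/.getall() are the newer names for extract_first()/extract(), and response.follow() resolves relative links for you. A minimal sketch of the same top-level parse using those APIs (same logic as assumed from the code above):

def parse(self, response):
    # response.xpath() / .get() replace Selector(...) / extract_first()
    for obj in response.xpath('//div[@class="category-panel"]'):
        href = obj.xpath('.//a[@class="morebrands"]/@href').get()
        if href and href.strip().strip('/') in ('shouji', 'pingban', 'laptop', 'sheying', 'digital'):
            flag = href.strip().strip('/')
            # response.follow() resolves the relative link against the page URL
            yield response.follow(href, meta={'flag': flag}, callback=self.c_item)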


Pipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import requests


class AihuishouPipeline(object):

    def process_item(self, item, spider):
        # DownloadImg items: fetch the image and save it under imgpath
        # (the class/images/ and item/images/ directories must already exist)
        if item.get('imgurl'):
            url = item.get('imgurl')
            res = requests.get(url)
            # strip any query string before deriving the file name
            img_path = item.get('imgpath') + (url.split('?')[0]).split('/')[-1]
            with open(img_path, 'wb') as f:
                f.write(res.content)

        # AihuishouItem items: append one line per entry to a text file
        # named after the category flag
        if item.get('_item'):
            for c, n in item.get('_item').items():
                print(n)
                with open('item/%s.txt' % c, 'a+', encoding='utf-8') as f:
                    f.write(n + '\n')

        if item.get('_class'):
            for c, n in item.get('_class').items():
                print(n)
                with open('class/%s.txt' % c, 'a+', encoding='utf-8') as f:
                    f.write(n + '\n')

        return item
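
As the header comment says, the pipeline only runs once it is registered in settings.py. Assuming the project module is named aihuishou (consistent with the imports in the spider), the entry looks like this; 300 is just a conventional priority value:

# settings.py
ITEM_PIPELINES = {
    'aihuishou.pipelines.AihuishouPipeline': 300,
}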

Item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class DownloadImg(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    imgurl = scrapy.Field()
    imgpath = scrapy.Field()


class AihuishouItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    _class = scrapy.Field()
    _item = scrapy.Field()
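
scrapy.Item instances behave like dicts, which is exactly what the pipeline's item.get(...) calls rely on to tell the two item types apart: asking an item for a field it does not declare simply returns None instead of raising. A quick illustration (the URL is made up):

img = DownloadImg(imgurl='https://example.com/logo.png', imgpath='class/images/')
print(img.get('imgurl'))   # https://example.com/logo.png
print(img.get('_class'))   # None -- not a field of DownloadImg, so the pipeline skips it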



Running the Project

scrapy crawl <spider_name>
# run without log output
scrapy crawl <spider_name> --nolog
# To run a single spider file on its own, cd to where the spider file lives
# and swap "crawl <spider_name>" for "runspider <file_name>.py"
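
For this project specifically, with the spider named huishou as defined above (the runspider path assumes genspider's default file location):

cd aihuishou
scrapy crawl huishou --nolog
# or run the spider file directly, without the project machinery:
scrapy runspider aihuishou/spiders/huishou.py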
