Scrapy is an application framework written for crawling websites and extracting structured data. It can be used in a wide range of programs, including data mining, information processing, and archiving historical data.
I'm using Anaconda, so I install it by running
conda install scrapy
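If the install went through, Scrapy can report its own version:
scrapy version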
Switch to the target folder, then run
scrapy startproject one_night_in_shanghai
which generates the following directory structure:
one_night_in_shanghai/
    scrapy.cfg
    one_night_in_shanghai/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
Here,
- scrapy.cfg: the project's configuration file;
- one_night_in_shanghai/: the project's Python module; the code we add later lives here;
- one_night_in_shanghai/items.py: the project's item definitions;
- one_night_in_shanghai/pipelines.py: the project's pipelines;
- one_night_in_shanghai/settings.py: the project's settings file;
- one_night_in_shanghai/spiders/: the directory that holds the spider code.
An Item is the container that holds the scraped data. It is used much like a Python dict, but adds an extra layer of protection against undefined-field errors caused by typos. Declare the fields in items.py:
import scrapy

class OneNightInShanghaiItem(scrapy.Item):
    img = scrapy.Field()     # I want to crawl images here, so define a field for them
    #vedio = scrapy.Field()  # in case videos get crawled later
    gif = scrapy.Field()     # for crawling gifs
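A quick sketch of the typo protection mentioned above, using the item we just defined (the URL is a placeholder): assigning to a declared field works like a dict, while an undeclared key raises KeyError:

item = OneNightInShanghaiItem()
item['img'] = ['http://example.com/a.jpg']  # fine: 'img' is a declared field
item['jpg'] = []  # KeyError: 'jpg' was never declared with scrapy.Field()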
Next, write the spider itself under spiders/ (e.g. spiders/img_spider.py; the file name is up to you):

#-*- coding: utf-8 -*-
import scrapy
from one_night_in_shanghai.items import OneNightInShanghaiItem

class last_day_in_September_spider(scrapy.Spider):
    # the spider's name; must be unique, so it can be told apart from spiders you add later
    name = "img"
    # optional: limits the crawl area; links outside these domains are not followed
    allowed_domains = ["so.redocn.com"]  # can be omitted if the pages need no such restriction
    # the page(s) the crawl starts from
    start_urls = ["http://so.redocn.com/shuiguo/cbaeb9fb.htm"]

    def parse(self, response):  # friendly reminder: don't rename this method, or you're on your own )= =(
        # extract the image src attributes with XPath; for the syntax, see the Scrapy tutorial linked at the end
        urls = response.xpath('//div[@class="wrap g-bd"]/div/dl/dd/a/img[not(contains(@class, "lazy"))]/@src').extract()
        for url in urls:
            # we defined the item earlier; instantiate it here
            imgItem = OneNightInShanghaiItem()
            # assign the extracted url to the item
            imgItem['img'] = [url]
            imgItem['gif'] = []  # since the gif field is declared above, it has to be initialized too
            # hand the result over to the pipeline
            yield imgItem
        # pagination
        ##response.xpath('//a[@class="next"]//@href').extract()  # this works too
        nexturl = response.xpath('//a[contains(text(),"下一页")]/@href').extract()
        if nexturl:  # only follow when a next-page ("下一页") link actually exists
            yield scrapy.Request(response.urljoin(nexturl[0]), callback=self.parse)
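XPath expressions like these are easiest to debug interactively: scrapy shell fetches a page and drops you into a Python prompt with response already built, so you can try selectors before putting them into the spider:

scrapy shell "http://so.redocn.com/shuiguo/cbaeb9fb.htm"
>>> response.xpath('//a[contains(text(),"下一页")]/@href').extract()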
At the same time, edit pipelines.py to further process the scraped data:
# -*- coding: utf-8 -*-
# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os
import urllib.request

class OneNightInShanghaiPipeline(object):
    def process_item(self, item, spider):
        dir_path = '/home/archer/for_fun/one_night_in_shanghai/result'
        os.makedirs(dir_path, exist_ok=True)  # make sure the output folder exists
        # both fields hold lists of urls, so they can be handled the same way
        for field in ('img', 'gif'):
            for image_url in item.get(field, []):
                file_name = image_url.split('/')[-1]  # the image's file name
                file_path = os.path.join(dir_path, file_name)
                if os.path.exists(file_path):  # skip images that are already on disk
                    continue
                with open(file_path, 'wb') as file_writer:
                    conn = urllib.request.urlopen(image_url)  # download the image
                    file_writer.write(conn.read())
        return item
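As an aside, recent Scrapy versions ship a built-in ImagesPipeline (it requires Pillow) that takes care of downloading, deduplication, and storage for you. A minimal sketch of switching to it, assuming the item field is renamed to image_urls, the field name that pipeline expects by default:

# settings.py
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/archer/for_fun/one_night_in_shanghai/result'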
Of course, if you're in the mood, you can also tweak settings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for one_night_in_shanghai project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'one_night_in_shanghai'
SPIDER_MODULES = ['one_night_in_shanghai.spiders']
NEWSPIDER_MODULE = 'one_night_in_shanghai.spiders'
# enable the pipeline; when several pipelines are registered, lower numbers run first
ITEM_PIPELINES = {
    'one_night_in_shanghai.pipelines.OneNightInShanghaiPipeline': 1,
}
DOWNLOAD_DELAY = 1
ROBOTSTXT_OBEY = True
Finally, run the crawl from the project root; the argument is the name we gave the spider:
scrapy crawl img
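If you'd also like the yielded items dumped to a file, Scrapy's feed export flag does it without any extra code:
scrapy crawl img -o items.json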