我们通过一个案例来梳理使用scrapy框架抓取数据入库以及下载图片媒体文件的整个流程
在命令行编写命令,创建项目: $ scrapy startproject educsdn
项目目录结构:
educsdn
├── educsdn
│ ├── __init__.py
│ ├── __pycache__
│ ├── items.py # Items的定义,定义抓取的数据结构
│ ├── middlewares.py # 定义Spider和DownLoader的Middlewares中间件实现
│ ├── pipelines.py # 它定义Item Pipeline的实现,即定义数据管道
│ ├── settings.py # 它定义项目的全局配置
│ └── spiders # 其中包含一个个Spider的实现,每个Spider都有一个文件
│ ├── __init__.py
│ └── __pycache__
└── scrapy.cfg #Scrapy部署时的配置文件,定义了配置文件路径、部署相关信息等内容
进入educsdn项目目录 $ cd educsdn
执行genspider命令,第一个参数是Spider的名称,第二个参数是网站域名。$ scrapy genspider courses edu.csdn.net
新的目录结构:
├── demo
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ └── settings.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ └── spiders
│ ├── __init__.py
│ ├── __pycache__
│ │ └── __init__.cpython-36.pyc
│ └── courses.py #在spiders目录下有了一个爬虫类文件courses.py
└── scrapy.cfg
Item是保存爬取数据的容器,它的使用方法和字典类型,但相比字典多了些保护机制。
创建Item需要继承scrapy.Item类,并且定义类型为scrapy.Field的字段:(课程标题、课程地址、图片、授课老师,视频时长、价格)
具体代码如下:(修改类名为CoursesItem)在文件:items.py中
import scrapy
class CoursesItem(scrapy.Item):
'''
课程信息item类
'''
# define the fields for your item here like:
# 课程标题、课程地址、图片、授课老师,视频时长、价格
title = scrapy.Field()
url = scrapy.Field()
pic = scrapy.Field()
teacher = scrapy.Field()
time = scrapy.Field()
price = scrapy.Field()
def parse(self, response):
pass
在courses.py文件中,parse()方法的参数response是start_urls里面的链接爬取后的结果。
提取的方式可以是CSS选择器、XPath选择器或者是re正则表达式。
代码如下:
# -*- coding: utf-8 -*-
import scrapy
from educsdn.items import CoursesItem
class CoursesSpider(scrapy.Spider):
name = 'courses'
allowed_domains = ['edu.csdn.net']
start_urls = ['https://edu.csdn.net/courses/o5329/p1']
p = 1
def parse(self, response):
''' 解析数据信息 '''
# print(response.url)
dlist = response.selector.css('div.course_item')
for dd in dlist:
# print(dd.css('span.title::text').extract_first())
item = CoursesItem()
item['title'] = dd.css("span.title::text").extract_first()
item['url'] = dd.css("a::attr(href)").extract_first()
item['pic'] = dd.css("img::attr(src)").extract_first()
item['teacher'] = dd.css("span.lecname::text").extract_first()
item['time'] = dd.re_first('(.*?)课时')
item['price'] = dd.re_first("¥([0-9\.]+)")
#print(item)
#print('=' * 50)
yield item # 此处的数据会被送到pipelines中去
self.p += 1
# 此处只象征性爬取2页,可以通过程序做更好的处理
if self.p < 2:
next_url = 'https://edu.csdn.net/courses/o5329/p' + str(self.p)
url = response.urljoin(next_url)
yield scrapy.Request(url = url, callback = self.parse)
在mysql中创建数据库csdndb和数据表courses
CREATE TABLE `courses` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(255) DEFAULT NULL,
`url` varchar(255) DEFAULT NULL,
`pic` varchar(255) DEFAULT NULL,
`teacher` varchar(32) DEFAULT NULL,
`time` varchar(16) DEFAULT NULL,
`price` varchar(16) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8
Item Pipeline为项目管道,当Item生产后,他会自动被送到Item Pipeline进行处理:
我们常用Item Pipeline来做如下操作:
pipelines.py 代码如下:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
class EducsdnPipeline(object):
''' 此处 '''
def process_item(self, item, spider):
# 处理数据
if item['title']:
item['title'] = item['title'].strip()
# 用于判断是否丢弃
dropFlag = (item['teacher'] == None) or (item['title'] == None) or (item['price'] == None) or (item['url'] == None) or (item['pic'] == None)
if dropFlag:
raise DropItem('this item is dropped!')
else:
return item
class MysqlPipeline(object):
''' 用于操作数据库的类 '''
def __init__(self, host, user, password, database, port):
self.host = host
self.user = user
self.password = password
self.database = database
self.port = port
# 依赖注入
@classmethod
def from_crawler(cls, crawler):
return cls(
host = crawler.settings.get('MYSQL_HOST'),
user = crawler.settings.get('MYSQL_USER'),
password = crawler.settings.get('MYSQL_PASS'),
database = crawler.settings.get('MYSQL_DATABASE'),
port = crawler.settings.get('MYSQL_PORT'),
)
def open_spider(self, spider):
''' 连接数据库 '''
self.db = pymysql.connect(self.host,self.user,self.password,self.database,charset='utf8',port=self.port)
self.cursor = self.db.cursor()
def process_item(self, item, spider):
''' 执行数据表的写入操作 '''
sql = "insert into courses(title, url, pic, teacher, time, price) values('%s','%s','%s','%s','%s','%s')"%(item['title'],item['url'],item['pic'],item['teacher'],str(item['time']),str(item['price']))
self.cursor.execute(sql)
self.db.commit()
return item
def close_spider(self, spider):
self.db.close()
class ImagePipeline(ImagesPipeline):
'''自定义图片存储类'''
def get_media_requests(self, item, info):
'''通过抓取的item对象获取图片信息,并创建Request请求对象添加调度队列,等待调度执行下载'''
yield Request(item['pic'])
def file_path(self,request,response=None,info=None):
'''返回图片下载后保存的名称,没有此方法Scrapy则自动给一个唯一值作为图片名称'''
url = request.url
file_name = url.split("/")[-1]
return file_name
def item_completed(self, results, item, info):
''' 下载完成后的处理方法,其中results内容结构如下说明'''
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
#item['image_paths'] = image_paths
return item
打开配置文件:settings.py 开启并配置ITEM_PIPELINES信息,配置数据库连接信息
ITEM_PIPELINES = {
'educsdn.pipelines.EducsdnPipeline': 300,
'educsdn.pipelines.MysqlPipeline': 301,
}
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'csdndb' # 填写你的数据库名称
MYSQL_USER = 'root' # 填写你的数据库登录用户
MYSQL_PASS = '' # 填写你的数据库登录密码
MYSQL_PORT = 3306
scrapy crawl courses
1 )关于ImagesPipeline介绍
2 ) 具体使用
首先定义存储文件的路径,在settings.py配置文件中添加代码:IMAGES_STORE = ‘./images’
并在项目目录下创建 images 文件夹 (与scrapy.cfg文件同级)。
要在pipelines.py文件中定义一个ImagePipeline
类,其代码如下:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
class EducsdnPipeline(object):
''' 此处 '''
def process_item(self, item, spider):
# 处理数据
if item['title']:
item['title'] = item['title'].strip()
# 用于判断是否丢弃
dropFlag = (item['teacher'] == None) or (item['title'] == None) or (item['price'] == None) or (item['url'] == None) or (item['pic'] == None)
if dropFlag:
raise DropItem('this item is dropped!')
else:
return item
class MysqlPipeline(object):
''' 用于操作数据库的类 '''
def __init__(self, host, user, password, database, port):
self.host = host
self.user = user
self.password = password
self.database = database
self.port = port
# 依赖注入
@classmethod
def from_crawler(cls, crawler):
return cls(
host = crawler.settings.get('MYSQL_HOST'),
user = crawler.settings.get('MYSQL_USER'),
password = crawler.settings.get('MYSQL_PASS'),
database = crawler.settings.get('MYSQL_DATABASE'),
port = crawler.settings.get('MYSQL_PORT'),
)
def open_spider(self, spider):
''' 连接数据库 '''
self.db = pymysql.connect(self.host,self.user,self.password,self.database,charset='utf8',port=self.port)
self.cursor = self.db.cursor()
def process_item(self, item, spider):
''' 执行数据表的写入操作 '''
sql = "insert into courses(title, url, pic, teacher, time, price) values('%s','%s','%s','%s','%s','%s')"%(item['title'],item['url'],item['pic'],item['teacher'],str(item['time']),str(item['price']))
self.cursor.execute(sql)
self.db.commit()
return item
def close_spider(self, spider):
self.db.close()
class ImagePipeline(ImagesPipeline):
'''自定义图片存储类'''
def get_media_requests(self, item, info):
'''通过抓取的item对象获取图片信息,并创建Request请求对象添加调度队列,等待调度执行下载'''
yield Request(item['pic'])
def file_path(self,request,response=None,info=None):
'''返回图片下载后保存的名称,没有此方法Scrapy则自动给一个唯一值作为图片名称'''
url = request.url
file_name = url.split("/")[-1]
return file_name
def item_completed(self, results, item, info):
''' 下载完成后的处理方法,其中results内容结构如下说明'''
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
#item['image_paths'] = image_paths
return item
在item_completed()
方法中,results参数内容结构如下:
print(results)
[(True, {'url': 'https://img-bss.csdn.net/201803191642534078.png',
'path': '201803191642534078.png',
'checksum': 'cc1368dbc122b6762f3e26ccef0dd105'}
)]
在settings.py文件中配置如下(启用下载):
...
ITEM_PIPELINES = {
'educsdn.pipelines.EducsdnPipeline': 300,
'educsdn.pipelines.ImagePipeline': 301,
'educsdn.pipelines.MysqlPipeline': 302,
}
MYSQL_HOST = "localhost"
MYSQL_DATABASE = "csdndb"
MYSQL_USER = "root"
MYSQL_PASS = ""
MYSQL_PORT = 3306
IMAGES_STORE = "./images"
...