To make better use of the large amount of data available online and let users access film information conveniently, a movie-information search engine was developed using a crawler built on the Scrapy framework. It crawls film information from the Douban website so that users can obtain accurate, up-to-date movie information.
Project code
The crawl target is "Douban Movies" (豆瓣电影). The spider collects the site's Top 250 ranking as well as comedy and action titles, extracting each film's title, rating, director, release date, and short review quote.
First, create the project:
scrapy startproject DouBan
Then generate the spider:
cd DouBan/
scrapy genspider douban douban.com
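These commands produce the usual Scrapy project skeleton (shown here for orientation; exact contents may vary slightly between Scrapy versions):
DouBan/
    scrapy.cfg
    DouBan/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            douban.py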
Code in items.py:
# -*- coding: utf-8 -*-
"""
# 1. item.py文件的功能?
item.py主要目标是从非结构化来源(通常是网页)提取结构化数据。Scrapy爬虫可以将提取的数据作为Python语句返回。
# 2. 为什么使用item.py?
虽然方便和熟悉,Python dicts缺乏结构:很容易在字段名称中输入错误或返回不一致的数据,特别是在与许多爬虫的大项目。
# 3. item.py文件的优势?
- 定义公共输出数据格式,Scrapy提供Item类。
- Item对象是用于收集所抓取的数据的简单容器。
- 提供了一个类似字典的 API,具有用于声明其可用字段的方便的语法。
"""
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# class DoubanItem(scrapy.Item):
# # define the fields for your item here like:
# # name = scrapy.Field()
# title = scrapy.Field()
# rating_num = scrapy.Field()
class DouBanMovieItem(scrapy.Item):
    """
    Declares the names of the data fields to be scraped:
    film title (title),
    rating (score),
    short review quote (quote),
    director (director),
    release date (release_date),
    number of ratings (comment_num).
    Each data field is declared with Field().
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # film title
    score = scrapy.Field()         # rating
    quote = scrapy.Field()         # short review quote
    director = scrapy.Field()      # director
    release_date = scrapy.Field()  # release date
    comment_num = scrapy.Field()   # number of ratings
    image_url = scrapy.Field()     # URL of the poster image
    detail_url = scrapy.Field()    # URL of the film's detail page
    image_path = scrapy.Field()    # local path of the downloaded poster
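A quick illustration of how this Item behaves (a minimal sketch, not part of the project files): it is accessed like a dict, can be converted to a plain dict for serialization, and assigning to a field that was never declared raises a KeyError, which is exactly the protection over plain dicts described in the docstring above.
from DouBan.items import DouBanMovieItem

item = DouBanMovieItem(title='肖申克的救赎', score='9.7')
print(item['title'])                 # dict-style read access
item['director'] = '弗兰克·德拉邦特'    # assignment to a declared field
print(dict(item))                    # convert to a plain dict, e.g. before json.dumps
# item['rating'] = '9.7'             # KeyError: 'rating' is not a declared field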
Key code in middlewares.py:
# -*- coding: utf-8 -*-
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
class DoubanSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DoubanDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
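Note that these two classes are just the generated templates; Scrapy only runs them if they are enabled in settings.py. A minimal sketch of the corresponding settings entries (543 is the priority value suggested by the generated template and can be changed):
SPIDER_MIDDLEWARES = {
    'DouBan.middlewares.DoubanSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'DouBan.middlewares.DoubanDownloaderMiddleware': 543,
}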
Code in pipelines.py:
import json
import pymysql
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class DoubanPipeline(object):
    def process_item(self, item, spider):
        return item
class AddScoreNum(object):
    """Add 1 to the rating scraped from the page."""
    def process_item(self, item, spider):
        if item['score']:
            score = float(item['score'])
            item['score'] = str(score + 1)
            return item
        else:
            raise Exception("no score was scraped for this item")
class JsonWriterPipeline(object):
    """Open the output file when the spider starts and close it when the spider finishes."""
    def open_spider(self, spider):
        self.file = open('douban.json', 'w')

    def process_item(self, item, spider):
        # dict(item): convert the Item object to a dict
        # json.dumps: serialize the dict to a JSON string
        # indent=4: indent the stored JSON by 4 spaces
        # ensure_ascii=False: keep Chinese characters readable instead of escaped
        line = json.dumps(dict(item), indent=4, ensure_ascii=False)
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
class MysqlPipeline(object):
    """Pipeline that stores items in MySQL."""
    def open_spider(self, spider):
        # connect to the database
        self.connect = pymysql.connect(
            host='127.0.0.1',        # database host
            port=3306,               # database port
            db='scrapyProject',      # database name
            user='root',             # database user
            passwd='westos',         # database password
            charset='utf8',          # character set
            use_unicode=True,
            autocommit=True
        )
        # the cursor executes all insert/delete/select/update statements
        self.cursor = self.connect.cursor()
        self.cursor.execute("create table if not exists douBanTop("
                            "title varchar(50) unique, "
                            "score float, "
                            "quote varchar(100), "
                            "director varchar(100), "
                            "comment_num int, "
                            "release_date varchar(10));")

    def process_item(self, item, spider):
        insert_sqli = ("insert into douBanTop(title, score, quote, director) "
                       "values (%s, %s, %s, %s)")
        print(insert_sqli)
        try:
            # parameterized execution avoids quoting problems in titles and quotes
            self.cursor.execute(insert_sqli,
                                (item['title'], item['score'], item['quote'], item['director']))
            # commit the SQL statement
            self.connect.commit()
        except Exception as e:
            self.connect.rollback()
        return item  # process_item must return the item

    def close_spider(self, spider):
        self.connect.commit()
        self.cursor.close()
        self.connect.close()
class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        """
        Issue a download request for each item's poster image.
        :param item: a single Item object
        :param info:
        :return:
        """
        print("item: ", item)
        yield scrapy.Request(item['image_url'])
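items.py declares an image_path field, but MyImagesPipeline above never fills it in. A minimal sketch of how that could be done by overriding item_completed(), the ImagesPipeline hook that runs after the downloads finish (this method is an addition, not part of the original project code):
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info) tuples produced by ImagesPipeline
        image_paths = [file_info['path'] for ok, file_info in results if ok]
        if not image_paths:
            raise DropItem("poster image was not downloaded")
        item['image_path'] = image_paths[0]  # path relative to IMAGES_STORE
        return item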
Add the following code to settings.py:
BOT_NAME = 'DouBan'
SPIDER_MODULES = ['DouBan.spiders']
NEWSPIDER_MODULE = 'DouBan.spiders'
from fake_useragent import UserAgent
ua = UserAgent()
USER_AGENT = ua.random
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy.pipelines.files.FilesPipeline': 2,
    'DouBan.pipelines.MyImagesPipeline': 2,
    'DouBan.pipelines.DoubanPipeline': 300,
    'DouBan.pipelines.JsonWriterPipeline': 200,  # the smaller the number, the earlier the pipeline runs
    'DouBan.pipelines.AddScoreNum': 100,         # process the scraped data before it is saved
    'DouBan.pipelines.MysqlPipeline': 200,       # process the scraped data before it is saved
}
# FILES_STORE = '/tmp/files/'   # file storage path
IMAGES_STORE = '/tmp/images/'   # image storage path
# 90 days of delay for files expiration
# FILES_EXPIRES = 90
# 30 days of delay for images expiration
IMAGES_EXPIRES = 30
# image thumbnails
IMAGES_THUMBS = {
    'small': (250, 250),
    'big': (270, 270),
}
# image filter: minimum height and width
IMAGES_MIN_HEIGHT = 110
IMAGES_MIN_WIDTH = 110
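With the settings above in place, the crawl is started from the project root with the standard Scrapy command:
scrapy crawl douban
Given the pipelines enabled in ITEM_PIPELINES, the scraped items end up in douban.json (JsonWriterPipeline) and in the douBanTop table of the scrapyProject MySQL database (MysqlPipeline; the database itself must already exist on the local server), while the poster images are downloaded under /tmp/images/.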
Multiple spider files can be placed in the project's spiders directory. The douban spider (spiders/douban.py) looks like this:
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Request
from DouBan.items import DouBanMovieItem
class DoubanSpider(scrapy.Spider):
    name = 'douban'  # spider name; can be anything, but must be unique within the project
    allowed_domains = ['douban.com', 'doubanio.com']  # domains the spider may crawl
    # start_urls = ['http://douban.com/']  # seed URLs: the first URLs to crawl, handed to the scheduler by the engine
    start_urls = [
        'https://movie.douban.com/top250'
    ]
    url = 'https://movie.douban.com/top250'

    def parse(self, response):
        # <ol class="grid_view">
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            # create a fresh item for every movie so later entries do not overwrite earlier ones
            item = DouBanMovieItem()
            # film title (title): <span class="title">肖申克的救赎</span>
            # extract() converts the matched nodes into a list of strings
            item['title'] = movie.xpath(
                './/span[@class="title"]/text()'
            ).extract()[0]
            # rating (score): <span class="rating_num" property="v:average">9.7</span>
            item['score'] = movie.xpath(
                './/span[@class="rating_num"]/text()'
            ).extract()[0]
            # review quote (quote): some films have no quote, so store an empty string
            quote = movie.xpath(
                './/span[@class="inq"]/text()'
            ).extract()
            item['quote'] = quote[0] if quote else ''
            # director (director)
            """
            info looks like:
            ['导演: 奥利维·那卡什 Olivier Nakache / 艾力克·托兰达 Eric Toledano\xa0\xa0\xa0主...',
             '2011\xa0/\xa0...', '\n ']
            """
            info = movie.xpath(
                './/div[@class="bd"]/p/text()'
            ).extract()
            director = info[0].split('主演')[0].strip()
            item['director'] = director
            # e.g. 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p1454261925.jpg'
            item['image_url'] = movie.xpath('.//div[@class="pic"]/a/img/@src').extract()[0]
            # print("image url: ", item['image_url'])
            item['detail_url'] = movie.xpath('.//div[@class="hd"]//a/@href').extract()[0]
            # print("detail url: ", item['detail_url'])
            # release date (release_date): not extracted here
            yield item
        """
        Pagination markup at the bottom of each page:
        <span class="next">
            <link rel="next" href="?start=50&filter=">
            <a href="?start=50&filter=">后页></a>
        </span>
        """
        # nextLink = response.xpath('.//span[@class="next"]/link/@href').extract()  # returns a list
        # if nextLink:
        #     nextLink = nextLink[0]
        #     print('Next Link: ', nextLink)
        #     yield Request(self.url + nextLink, callback=self.parse)
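The release_date and comment_num fields declared in items.py are not filled in by parse() above. A hedged sketch of how they could be extracted inside the for loop, right before yield item; the regex reuses the re module imported at the top of the file, and the div.star/span[4] XPath for the rating count is an assumption about the Top250 page markup rather than part of the original project:
            # release year, taken from the second info line, e.g. '2011\xa0/\xa0...'
            year = re.search(r'\d{4}', info[1]) if len(info) > 1 else None
            item['release_date'] = year.group() if year else ''
            # number of ratings, e.g. <span>123456人评价</span> (assumed markup)
            num = movie.xpath('.//div[@class="star"]/span[4]/text()').extract()
            item['comment_num'] = int(re.sub(r'\D', '', num[0]) or 0) if num else 0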