A simple practice project: a crawler built on the Scrapy framework.
GitHub: https://github.com/P-jinsan/doubanmovie
Environment: PyCharm, Python 3.8, Google Chrome.
The target data (Douban Movie Top 250) spans more than one page, so the spider has to follow the pagination links.
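Douban's Top 250 is paginated, 25 movies per page, via a `?start=` query parameter. The spider below handles this by following each page's "next" link; as a sketch, the ten page URLs could just as well be generated up front:

```python
# Alternative to following the "next" link: enumerate the ten pages directly.
start_urls = ['https://movie.douban.com/top250/?start=%d' % (25 * i) for i in range(10)]
```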
The spider (spiders/dbmovie.py):

```python
import scrapy

from doubanmovie.items import DoubanmovieItem


class DbmovieSpider(scrapy.Spider):
    name = 'dbmovie'
    # allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250/']

    def parse(self, response):
        # Note: a single item instance is reused for every movie on the page,
        # which is why the pipeline deep-copies it before the async insert.
        item = DoubanmovieItem()
        selector = scrapy.Selector(response)
        movies = selector.xpath('//div[@class="item"]')
        for each in movies:
            num = each.xpath('div[@class="pic"]/em/text()').extract()[0]
            title = each.xpath('div[@class="info"]/div[@class="hd"]/a/span[@class="title"]/text()').extract()[0]
            star = each.xpath('div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()[0]
            # Not every movie has a one-line quote, so use extract_first()
            # and fall back to a blank value instead of raising IndexError.
            quote = each.xpath('div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()').extract_first()
            if quote is None:
                quote = ' '
            item['quote'] = quote
            item['star'] = star
            item['title'] = title
            item['num'] = num
            yield item
        # Follow the "next page" link until the last page is reached.
        nextPage = selector.xpath('//span[@class="next"]/link/@href').extract_first()
        if nextPage:
            next_url = response.urljoin(nextPage)
            yield scrapy.Request(next_url, callback=self.parse)
```
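The XPath expressions are easy to sanity-check before a full crawl with Scrapy's interactive shell. A quick session might look like this (the expected values assume the Top 250 page structure above):

```python
# In a terminal: scrapy shell 'https://movie.douban.com/top250/'
# Then, at the Python prompt the shell opens:
movies = response.xpath('//div[@class="item"]')
len(movies)                                            # expect 25 per page
movies[0].xpath('div[@class="pic"]/em/text()').get()   # ranking, e.g. '1'
```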
The item definition (items.py), one field per column we want to store:

```python
import scrapy


class DoubanmovieItem(scrapy.Item):
    title = scrapy.Field()   # movie title
    quote = scrapy.Field()   # one-line blurb
    star = scrapy.Field()    # rating score
    num = scrapy.Field()     # ranking (1-250)
```
The relevant settings (settings.py):

```python
BOT_NAME = 'doubanmovie'

SPIDER_MODULES = ['doubanmovie.spiders']
NEWSPIDER_MODULE = 'doubanmovie.spiders'

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'

ROBOTSTXT_OBEY = True

# MySQL connection parameters, read by the pipeline below
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'douban'
MYSQL_USER = 'root'
MYSQL_PASSWD = '123456'
MYSQL_PORT = 3306

ITEM_PIPELINES = {
    'doubanmovie.pipelines.DoubanmoviePipeline': 300,
}
```
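The pipeline below assumes the `douban` database and a `doubanmovie` table already exist. A minimal one-off setup sketch, matching the column names used in the pipeline's INSERT (the column types here are assumptions):

```python
import pymysql

# One-off setup for the database/table the pipeline writes to.
conn = pymysql.connect(host='localhost', user='root', password='123456', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS douban DEFAULT CHARACTER SET utf8")
    cursor.execute("USE douban")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS doubanmovie (
            id     INT PRIMARY KEY AUTO_INCREMENT,
            num    INT,            -- ranking, 1-250
            title  VARCHAR(100),   -- movie title
            rating VARCHAR(10),    -- score as text, e.g. '9.7'
            quote  VARCHAR(255)    -- one-line blurb, may be blank
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```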
The pipeline (pipelines.py). `from_settings` builds a Twisted adbapi connection pool from the MySQL settings above, and `process_item` schedules each insert asynchronously so the crawl is never blocked by the database:

```python
import copy

import pymysql
from twisted.enterprise import adbapi


class DoubanmoviePipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Build the database connection pool from the Scrapy settings."""
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # note: 'utf8', not 'utf-8'
            cursorclass=pymysql.cursors.DictCursor,
        )
        # ** expands the dict into keyword arguments: host=..., db=..., ...
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)  # the pool is handed to __init__ above

    # Called by Scrapy for every item the spider yields.
    def process_item(self, item, spider):
        # Deep copy: the spider reuses one item object, and the asynchronous
        # insert may only run after the spider has already overwritten its
        # fields, which would otherwise produce duplicate rows.
        asynItem = copy.deepcopy(item)
        query = self.dbpool.runInteraction(self._conditional_insert, asynItem)
        query.addErrback(self.handle_error)  # route failures to handle_error
        return item

    # Runs in the connection pool's thread, with a ready-made cursor.
    def _conditional_insert(self, cursor, item):
        sql = """insert into doubanmovie(num, title, rating, quote)
                 values (%s, %s, %s, %s)"""
        # Parameterized queries escape the values; no manual escaping needed.
        cursor.execute(sql, (int(item['num']), item['title'], item['star'], item['quote']))

    def handle_error(self, failure):
        # Print the Twisted failure so insert errors are not silently dropped.
        print(failure)
```
For a small crawl, a simpler synchronous pipeline (kept commented out in the project) does the same job, at the cost of blocking on every insert:

```python
import pymysql


class DoubanmoviePipeline(object):
    def __init__(self):
        # Connect to the database once when the pipeline is created
        self.conn = pymysql.connect(host='localhost', user='root',
                                    password='123456', database='douban',
                                    charset='utf8')
        # All inserts go through this cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Insert one row per item and commit immediately
        insert_sql = """insert into doubanmovie(num, title, rating, quote)
                        values (%s, %s, %s, %s)"""
        self.cursor.execute(insert_sql, (item['num'], item['title'],
                                         item['star'], item['quote']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        # Close the cursor and the connection when the spider finishes
        self.cursor.close()
        self.conn.close()
```
Finally, a small entry script (e.g. main.py in the project root) makes it possible to launch the spider straight from the IDE; it is equivalent to running `scrapy crawl dbmovie` in a terminal:

```python
from scrapy import cmdline

cmdline.execute("scrapy crawl dbmovie".split())
```
When crawling a lot of data, the crawl speed and the insert speed get out of step: the asynchronous insert can fire after the spider has already overwritten the shared item, producing large numbers of duplicate rows. The deep copy in the pipeline is what prevents this:

```python
asynItem = copy.deepcopy(item)  # deep copy, so the queued insert keeps its own snapshot
```
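A toy illustration of why the copy matters, with plain dicts standing in for Scrapy items:

```python
import copy

shared_item = {'title': 'movie A'}       # one object reused by the spider
queued = shared_item                     # what an async insert would hold
snapshot = copy.deepcopy(shared_item)    # what the pipeline keeps instead
shared_item['title'] = 'movie B'         # the spider moves on and overwrites

print(queued['title'])     # 'movie B' -- stale reference, duplicated data
print(snapshot['title'])   # 'movie A' -- correct row, thanks to the deep copy
```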