Getting started with the Scrapy crawler framework

Notes from learning the Scrapy crawler framework.

  • Creating a project:
# Create a Scrapy crawler project; project_name is the project name
scrapy startproject project_name

# Create a spider named spider_name, allowed to crawl the domain 'baidu.com'
scrapy genspider spider_name 'baidu.com'

# Create a CrawlSpider-based spider named spider_name, allowed to crawl the domain 'baidu.com'
scrapy genspider -t crawl spider_name 'baidu.com'
  • Create a runnable, debuggable main.py file
from scrapy.cmdline import execute

import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'qsbk'])  # 'qsbk' is the name of the spider to run

  • Brief overview of the workflow:

- Starting from start_urls, the start_requests method sends a request to each start URL and gets back the response. If needed, this method can be overridden, for example to carry cookies for a logged-in session (a sketch of building the cookies dict follows the snippet below).


def start_requests(self):
    # Override to attach cookies to the initial requests (e.g. to stay logged in)
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse, cookies=self.cookies)
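
For reference, a minimal sketch of how self.cookies might be built from a cookie string copied out of the browser (cookie_str here is a hypothetical placeholder):

# Hypothetical cookie string taken from the browser's request headers
cookie_str = "sessionid=abc123; token=xyz"
# Scrapy expects cookies as a dict of name -> value
cookies = {k: v for k, v in (pair.split("=", 1) for pair in cookie_str.split("; "))}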
	
- The parse method receives the response, extracts the data, and picks up the next category or next-page URL along with the data (the callback that receives the item is sketched after this snippet)
from copy import deepcopy
def parse(self, response):
    item = {}
    """
    Extract the data from the response here
    """
    # yield item
    # Pass a deep copy of the item to the next request via meta (next_url and parse_detail are placeholders)
    yield scrapy.Request(next_url, callback=self.parse_detail, meta={"item": deepcopy(item)})
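
The next callback can then read the item back out of response.meta; a minimal sketch, using the hypothetical parse_detail name from above:

def parse_detail(self, response):
    # Retrieve the partially filled item passed along in meta
    item = response.meta["item"]
    # ... fill in the remaining fields here ...
    yield item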
	
- When a URL is obtained, keep issuing requests with scrapy.Request(url, callback, meta).
- When the data is complete, pass it to the pipelines, which define how it is stored (the pipeline also has to be enabled in settings, as shown after the snippet below).
# pipelines.py
# Save the data to a MongoDB database
from pymongo import MongoClient

class BookPipeline(object):
    # Runs once when the spider opens: connect to the database
    def open_spider(self, spider):
        client = MongoClient('host', 27017)  # replace 'host' and the port with your own
        self.collection = client['dbname']['tablename']

    # Save the data
    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # item is an Item object; convert it to a dict before storing
        return item
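
For the pipeline to actually run, it also has to be enabled under ITEM_PIPELINES in settings.py; a minimal sketch, assuming the project package is named book (adjust the module path to your own project):

# settings.py
ITEM_PIPELINES = {
    'book.pipelines.BookPipeline': 300,  # lower numbers run earlier
}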
		
  • Settings frequently adjusted in the settings file:
# Whether to obey robots.txt; defaults to True
ROBOTSTXT_OBEY = False

# Log level to show:
LOG_LEVEL = 'WARNING'

# Download delay; set it a bit larger
DOWNLOAD_DELAY = 1

# Whether to disable cookies: disable them so the site cannot use cookies for anti-crawler detection; enable them if you need to carry cookies to log in
COOKIES_ENABLED = False

# MySQL database connection settings
MYSQL_HOST = ''
MYSQL_DBNAME = ''
MYSQL_USER = ''
MYSQL_PASSWORD = ''

  • Distributed crawling with scrapy_redis:
# Required settings in settings.py:
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://192.168.82.64:6379"

# spider.py: the spider inherits from a different base class
from scrapy_redis.spiders import RedisSpider

class DangSpider(RedisSpider):
    name = 'dang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dang"  # when the spider starts, lpush a key named 'dang' into Redis with the start URL as its value; the start URL is then crawled only once (see the lpush sketch below)

# spider.py when using a CrawlSpider:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider

class DangSpider(RedisCrawlSpider):
    name = 'dang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dang"

    rules = (
        # Extract links with XPath; matches all URLs under the selected li tags
        Rule(LinkExtractor(restrict_xpaths=("//div[@class='categoryRefinementsSection']/ul/li",)), follow=True),
    )
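
To kick off the crawl, push the start URL into Redis under the key named by redis_key; a minimal sketch using the redis-py client (assumed to be installed), with the URL and Redis address taken from the examples above:

import redis

r = redis.Redis.from_url("redis://192.168.82.64:6379")
# lpush the start URL under the 'dang' key; the spider pops it and starts crawling
r.lpush("dang", "http://book.dangdang.com/")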
  • Asynchronous database storage:
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi  # turns the blocking MySQL calls into asynchronous ones

class MysqlTwistedPipeline:

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )

        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)  # pass the connection parameters as keyword arguments

        return cls(dbpool)

    def process_item(self, item, spider):
        # Run the insert asynchronously through the connection pool
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle exceptions
        return item

    def handle_error(self, failure):
        # Handle errors raised by the asynchronous insert
        print(failure)

    def do_insert(self, cursor, item):
        # Insert logic goes here; insert_sql is built from the item (see the sketch below)
        cursor.execute(insert_sql)
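
A minimal sketch of what do_insert could look like in practice, assuming a hypothetical book table and an item with title and price fields:

    def do_insert(self, cursor, item):
        # Hypothetical table and columns; adapt the SQL to your own item definition
        insert_sql = "insert into book(title, price) values (%s, %s)"
        cursor.execute(insert_sql, (item['title'], item['price']))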
		
		
