# Create a Scrapy project; spider_name is the project name
scrapy startproject spider_name
# Create a spider named spider_name, allowed to crawl the domain 'baidu.com'
scrapy genspider spider_name 'baidu.com'
# Create a CrawlSpider named spider_name, allowed to crawl the domain 'baidu.com'
scrapy genspider -t crawl spider_name 'baidu.com'
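- For reference, the spider file that `scrapy genspider` produces looks roughly like this (a sketch; the exact template varies between Scrapy versions):
import scrapy

class SpiderNameSpider(scrapy.Spider):
    name = 'spider_name'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        pass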
from scrapy.cmdline import execute
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(['scrapy', 'crawl', 'qsbk'])  # qsbk is the name of the spider to run
- Starting from start_urls, the start_requests method sends requests to the start URLs and receives the responses. It can be overridden when needed, for example to carry cookies for a logged-in session:
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(url, callback=self.parse, cookies=self.cookies)
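- self.cookies above is assumed to be a dict of cookie name/value pairs, for example copied from the browser's developer tools (the names and values here are placeholders):
cookies = {"sessionid": "xxxxxxxx", "csrftoken": "yyyyyyyy"}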
- The parse method receives the response, extracts the data, and obtains the next category or next-page URL along with the data:
from copy import deepcopy
def parse(self, response):
    item = {}
    """
    Extract data here
    """
    # yield item
    yield scrapy.Request(url, callback=self.parse_detail, meta={"item": deepcopy(item)})  # parse_detail: the next callback; the name is illustrative
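- A minimal sketch of that next callback (the parse_detail name is an assumption): the item passed through meta is read back from response.meta, filled in further, and finally yielded to the pipelines:
def parse_detail(self, response):
    item = response.meta["item"]
    # extract the remaining fields from this response into item
    yield item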
- With the new URL, keep sending requests: scrapy.Request(url, callback, meta).
- Once the data is obtained, pass it to the pipelines, which define how it is saved.
# pipelines.py
# Save data to a MongoDB database
from pymongo import MongoClient
class BookPipeline(object):
    # Runs once when the spider opens; connect to the database
    def open_spider(self, spider):
        client = MongoClient('host', 27017)  # replace with the real host and port
        self.collection = client['dbname']['tablename']
    # Save the data
    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))  # item is an Item object; convert it to a dict before saving
        return item
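- Remember to enable the pipeline in settings.py; the module path here is an assumption, use your own project name:
ITEM_PIPELINES = {
    'myproject.pipelines.BookPipeline': 300,
}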
# Obey the robots.txt protocol; defaults to True
ROBOTSTXT_OBEY = False
# Log level to show:
LOG_LEVEL = 'WARNING'
# Download delay; set it a bit higher
DOWNLOAD_DELAY = 1
# Whether cookies are enabled; disable them so the site cannot use cookies to detect the crawler. If you need cookies to log in, enable this instead
COOKIES_ENABLED = False
# MySQL connection settings
MYSQL_HOST = ''
MYSQL_DBNAME = ''
MYSQL_USER = ''
MYSQL_PASSWORD = ''
# Settings required in settings.py for scrapy_redis:
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
REDIS_URL = "redis://192.168.82.64:6379"
# spider.py: the spider inherits from a different base class
from scrapy_redis.spiders import RedisSpider
class DangSpider(RedisSpider):
    name = 'dang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dang"  # when the spider starts, lpush a key named "dang" into Redis with the start URL as its value; each start URL is then crawled only once
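- To kick the spider off, push the start URL onto the redis_key list. A sketch using the redis-py client, with the Redis address taken from the REDIS_URL setting above:
import redis
r = redis.from_url("redis://192.168.82.64:6379")
r.lpush("dang", "http://book.dangdang.com/")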
# spider.py when using a CrawlSpider:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule
from scrapy_redis.spiders import RedisCrawlSpider
class DangSpider(RedisCrawlSpider):
    name = 'dang'
    allowed_domains = ['dangdang.com']
    # start_urls = ['http://book.dangdang.com/']
    redis_key = "dang"
    rules = (
        # Extract links by XPath: all URLs under the matched li tags
        Rule(LinkExtractor(restrict_xpaths=("//div[@class='categoryRefinementsSection']/ul/li",)), follow=True),
    )
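- Rules with only follow=True just follow links; pages you actually want to parse need a Rule with a callback. A sketch of such a rule to add inside rules, where the allow pattern and the parse_item name are illustrative:
Rule(LinkExtractor(allow=r'/\d+\.html'), callback='parse_item', follow=False),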
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi  # adbapi turns the MySQL operations into asynchronous ones
class MysqlTwistedPipeline:
    def __init__(self, dbpool):
        self.dbpool = dbpool
    @classmethod
    def from_settings(cls, settings):
        dbparms = dict(
            host = settings['MYSQL_HOST'],
            db = settings['MYSQL_DBNAME'],
            user = settings['MYSQL_USER'],
            passwd = settings['MYSQL_PASSWORD'],
            charset = 'utf8',
            cursorclass = MySQLdb.cursors.DictCursor,
            use_unicode = True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)  # pass the connection parameters as keyword arguments
        return cls(dbpool)
    def process_item(self, item, spider):
        # Run the insert asynchronously
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)  # handle exceptions
        return item
    def handle_error(self, failure):
        # Handle errors from the asynchronous insert
        print(failure)
    def do_insert(self, cursor, item):
        # Execute the insert; build insert_sql from the item's fields
        cursor.execute(insert_sql)
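- A hypothetical do_insert body: the table name and the 'title' / 'price' fields are assumptions, substitute the real item fields; the parameters are passed separately so MySQLdb escapes them:
def do_insert(self, cursor, item):
    insert_sql = "INSERT INTO book(title, price) VALUES (%s, %s)"
    cursor.execute(insert_sql, (item['title'], item['price']))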