# domain: the domain the spider is allowed to crawl
scrapy genspider [options] name domain
# generates the spider skeleton, so there is no need to configure the boilerplate by hand
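For example, to generate a spider named qidian restricted to qidian.com (both names are just placeholders):
scrapy genspider qidian qidian.com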
Add the following to settings.py:
MY_USER_AGENT = [...]  # a list of usable User-Agent strings
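A minimal sketch with a few common browser User-Agent strings (use whatever values work for your target site):
MY_USER_AGENT = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15',
]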
Add the following to middlewares.py:
import scrapy
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random


class MyUserAgentMiddleware(UserAgentMiddleware):  # inherits from UserAgentMiddleware
    '''
    Set a random User-Agent on each request
    '''
    def __init__(self, user_agent):
        self.user_agent = user_agent

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            user_agent=crawler.settings.get('MY_USER_AGENT')  # read the User-Agent list from settings
        )

    def process_request(self, request, spider):
        agent = random.choice(self.user_agent)
        request.headers['User-Agent'] = agent
        # set it on the headers; returning None (the default) lets the request continue
Register the custom MyUserAgentMiddleware class in DOWNLOADER_MIDDLEWARES (and disable the built-in one):
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'myproject.middlewares.MyUserAgentMiddleware': 400,
}
Spider middlewares work the same way (see the sketch below).
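A minimal sketch, assuming a hypothetical project called myproject and a middleware that only logs what the spider yields:
# middlewares.py
class MySpiderMiddleware:
    def process_spider_output(self, response, result, spider):
        # runs over the items/requests the spider yields for this response
        for element in result:
            spider.logger.debug('spider yielded: %r', element)
            yield element

# settings.py
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.MySpiderMiddleware': 543,
}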
items.py: defines the Item fields (the structured data you want to scrape).
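A minimal sketch of an items.py, with a hypothetical item class and example fields:
import scrapy

class XiaozhuItem(scrapy.Item):
    # example fields; define one Field per piece of data you want to extract
    title = scrapy.Field()
    price = scrapy.Field()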
settings.py: defines the project's global configuration.
Below is my rough understanding of it (I'm actually quite curious about this file).
BOT_NAME = 'xiaozhu'  # project name
SPIDER_MODULES = ['xiaozhu.spiders']  # path to the spider modules
NEWSPIDER_MODULE = 'xiaozhu.spiders'  # module where newly generated spiders are created
Crawl responsibly by identifying yourself (and your website) on the user-agent
Sets the User-Agent used to imitate a browser when loading pages
USER_AGENT = 'qidianwang (+http://www.yourdomain.com)'
Obey robots.txt rules
Whether to obey the robots.txt protocol (True by default, i.e. obey it)
ROBOTSTXT_OBEY = False
Configure maximum concurrent requests performed by Scrapy (default: 16)
Maximum number of concurrent requests Scrapy may make at once (default: 16)
CONCURRENT_REQUESTS = 32
Configure a delay for requests for the same website (default: 0)
See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
See also autothrottle settings and docs
Download delay between requests, 0 by default
DOWNLOAD_DELAY = 0
The download delay setting will honor only one of:
Maximum number of concurrent requests allowed per domain (default: 8)
CONCURRENT_REQUESTS_PER_DOMAIN = 16
Maximum number of concurrent requests allowed per IP (default: 0)
1. When it is non-zero, CONCURRENT_REQUESTS_PER_IP takes precedence over CONCURRENT_REQUESTS_PER_DOMAIN
2. When it is non-zero, DOWNLOAD_DELAY is applied per IP rather than per website
CONCURRENT_REQUESTS_PER_IP = 16
Disable cookies (enabled by default)
Whether to send cookies; True (send them) by default
COOKIES_ENABLED = False
COOKIES_DEBUG defaults to False, i.e. cookies are not logged
COOKIES_DEBUG = True
Disable Telnet Console (enabled by default)
An extension: via the Telnet console you can inspect some of the state of the running crawler; enabled (True) by default
TELNETCONSOLE_ENABLED = False
Override the default request headers:
Default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
}
Enable or disable spider middlewares
See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
Spider middlewares
SPIDER_MIDDLEWARES = {
    'qidianwang.middlewares.QidianwangSpiderMiddleware': 543,
}
Enable or disable downloader middlewares
See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
Downloader middlewares; custom downloader middlewares must be enabled here, and the lower the number, the higher the priority
DOWNLOADER_MIDDLEWARES = {
    'qidianwang.middlewares.QidianUserAgentDownloadmiddlerware': 543,
    'qidianwang.middlewares.QidianProxyDownloadMiddlerware': 544,
    'qidianwang.middlewares.SeleniumDownlaodMiddlerware': 543,
}
Enable or disable extensions
See https://doc.scrapy.org/en/latest/topics/extensions.html
Extensions (add extensions here)
EXTENSIONS = {
    'scrapy.extensions.telnet.TelnetConsole': None,
}
Configure item pipelines
See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
Enable item pipelines; the lower the number, the higher the priority
ITEM_PIPELINES = {
    'qidianwang.pipelines.QidianwangPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
Dynamic download delay (the AutoThrottle extension; disabled by default)
To use it, step 1: turn it on with AUTOTHROTTLE_ENABLED = True
Enable and configure the AutoThrottle extension (disabled by default)
See https://doc.scrapy.org/en/latest/topics/autothrottle.html
AUTOTHROTTLE_ENABLED = True
The initial download delay
Initial download delay
AUTOTHROTTLE_START_DELAY = 5
The maximum download delay to be set in case of high latencies
Maximum download delay
AUTOTHROTTLE_MAX_DELAY = 60
The average number of requests Scrapy should be sending in parallel to
each remote server
Number of parallel requests sent to each remote server
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
Enable showing throttling stats for every response received:
Whether to enable AutoThrottle's debug mode
AUTOTHROTTLE_DEBUG = False
Enable and configure HTTP caching (disabled by default)
See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
An extension that caches responses (disabled by default, i.e. HTTPCACHE_ENABLED = False)
HTTPCACHE_ENABLED = True
Cache expiration time (in seconds)
HTTPCACHE_EXPIRATION_SECS = 0
Directory where the cache is stored
HTTPCACHE_DIR = 'httpcache'
Response status codes the cache should ignore; e.g. HTTPCACHE_IGNORE_HTTP_CODES = ['400'] means 400 responses are not cached
HTTPCACHE_IGNORE_HTTP_CODES = []
Storage backend used by the cache
HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Save log output to a local file
LOG_FILE = 'qdlogfile.log'
LOG_LEVEL = 'DEBUG'
For example, reading custom settings from settings.py via from_crawler:
def __init__(self, mongo_uri, mongo_db):
    self.mongo_uri = mongo_uri
    self.mongo_db = mongo_db

@classmethod
def from_crawler(cls, crawler):
    return cls(
        mongo_uri=crawler.settings.get('MONGO_URI'),
        mongo_db=crawler.settings.get('MONGO_DB')
    )
Selectors: a way of picking out elements; different selectors use different methods.
Selector can be used on its own, outside Scrapy; it is a powerful web-page parsing library.
Passing an HTML body string to Selector creates a Selector object (see the sketch below).
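A minimal sketch of that standalone usage (the HTML string is just a made-up example):
from scrapy import Selector

body = '<html><body><span>Hello world</span></body></html>'
selector = Selector(text=body)   # creates a Selector object from the HTML text
print(selector.xpath('//span/text()').extract_first())   # -> 'Hello world'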
xpath: response.xpath() is a Scrapy shortcut for response.selector.xpath(); it returns a SelectorList, a list made up of Selector objects.
XPath is very handy and pairs perfectly with regular expressions (see the sketch below).
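For instance, a sketch inside a spider's parse() method (the element paths and pattern are made up):
def parse(self, response):
    # .re() applies a regex to each matched node and returns the captured strings
    prices = response.xpath('//div[@class="price"]/text()').re(r'\d+\.\d+')
    titles = response.xpath('//h1/text()').extract()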
For asynchronously loaded content, you can either analyse the AJAX requests and scrape the corresponding API, or simulate a browser with Selenium or Splash.
Code example (I don't fully understand this code yet):
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from scrapy.http import HtmlResponse
from logging import getLogger


class SeleniumMiddleware(object):
    def __init__(self, timeout=None, service_args=[]):
        self.logger = getLogger(__name__)
        self.timeout = timeout  # page-load timeout
        self.browser = webdriver.PhantomJS(service_args=service_args)
        self.browser.set_window_size(1400, 700)  # window size
        self.browser.set_page_load_timeout(self.timeout)
        self.wait = WebDriverWait(self.browser, self.timeout)

    def __del__(self):
        self.browser.close()

    def process_request(self, request, spider):
        """
        Fetch the page with PhantomJS
        :param request: Request object
        :param spider: Spider object
        :return: HtmlResponse
        """
        self.logger.debug('PhantomJS is Starting')
        page = request.meta.get('page', 1)
        try:
            # Take the URL from the Request and load it with PhantomJS,
            # instead of using Scrapy's own downloader
            self.browser.get(request.url)
            if page > 1:
                input = self.wait.until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input')))
                submit = self.wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
                input.clear()
                input.send_keys(page)
                submit.click()
            self.wait.until(
                EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#mainsrp-pager li.item.active > span'), str(page)))
            self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
            return HtmlResponse(url=request.url, body=self.browser.page_source, request=request, encoding='utf-8',
                                status=200)
        except TimeoutException:
            return HtmlResponse(url=request.url, status=500, request=request)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(timeout=crawler.settings.get('SELENIUM_TIMEOUT'),
                   service_args=crawler.settings.get('PHANTOMJS_SERVICE_ARGS'))
Set this in settings.py:
DOWNLOADER_MIDDLEWARES = {
    "scrapyseleniumtest.middlewares.SeleniumMiddleware": 543,
}
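from_crawler in the middleware above also reads SELENIUM_TIMEOUT and PHANTOMJS_SERVICE_ARGS from settings, so they should be defined too; a minimal sketch with example values:
SELENIUM_TIMEOUT = 20   # seconds to wait for page loads and explicit waits
PHANTOMJS_SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']   # example PhantomJS options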
In the item pipeline (pipelines.py):
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'), mongo_db=crawler.settings.get('MONGO_DB'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # store the item in the collection named by item.collection
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
Add to settings.py:
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {  # enable this pipeline class
    'scrapyseleniumtest.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'localhost'
MONGO_DB = 'taobao'
# declare these two settings
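process_item above reads item.collection to pick the MongoDB collection, so the Item class is assumed to carry such an attribute; a minimal sketch (class name and fields are hypothetical):
import scrapy

class ProductItem(scrapy.Item):
    collection = 'products'   # MongoDB collection this item is written to
    title = scrapy.Field()
    price = scrapy.Field()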
Single spider: a single machine is limited by network speed, and I/O throughput is capped by the network link.
Problems with multiple spiders: data sharing (synchronization), and an automatic division-of-labour mechanism so resources are fully used and efficiency improves.
What Redis provides (an in-memory database that also persists to disk):
Installation (omitted)
Databases for storing the scraped content: MongoDB, MySQL, etc. (running in memory with data saved to disk?, controlling how long data is kept)
A simple view of how it works:
Redis manages the request queue (stored with de-duplication), and Scrapy's own request queue is no longer used (the queue is pointed at the remote one in settings, as sketched below).
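A minimal sketch of the scrapy-redis settings that do this (the Redis URL is a placeholder for your own server):
# settings.py
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'               # Redis-backed request queue
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'   # de-duplicate requests in Redis
SCHEDULER_PERSIST = True                                     # keep the queue between runs
REDIS_URL = 'redis://127.0.0.1:6379'                         # placeholder Redis server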