HTML pages are rendered in two ways: statically and dynamically. With static rendering, the server renders the entire HTML document and sends it to the client for display. Increasingly, however, the useful data on a page is loaded dynamically by JavaScript, so a traditional crawler that only fetches the raw HTML comes back nearly empty. This post shows how to use Scrapy + Splash to simulate browser behavior, let the JavaScript run, and then extract the data.
1. Install Scrapy
C:\Users\Administrator>pip install scrapy
# Success indicator:
Successfully installed Scrapy-1.5.1
2. Create a Scrapy project
C:\Users\Administrator>scrapy startproject ZhipinSpider
# On success, a ZhipinSpider folder is created under the Administrator directory with the following structure:
ZhipinSpider
    scrapy.cfg
    ZhipinSpider
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders
            __init__.py
            job_position.py
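startproject generates everything above except job_position.py. That spider file can be created by hand, or generated with scrapy genspider from inside the project directory (the domain argument zhipin.com is an assumption based on the target site):
C:\Users\Administrator\ZhipinSpider>scrapy genspider job_position zhipin.com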
3. Write the code
3.1 job_position.py
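The spider crawls the BOSS直聘 job-listing page, fills in the fields declared in items.py, and hands each item to the pipeline. The listing below is a minimal sketch of such a spider; the start URL and every CSS selector are placeholders (assumptions) that must be adapted to the real page structure before use.

import scrapy
from ZhipinSpider.items import ZhipinspiderItem

class JobPositionSpider(scrapy.Spider):
    name = 'job_position'
    allowed_domains = ['zhipin.com']
    # Placeholder listing URL; replace with the page you actually want to crawl
    start_urls = ['https://www.zhipin.com/c101280100/']

    def parse(self, response):
        # Every CSS selector below is a placeholder -- inspect the live page
        # and adjust them before running the spider.
        for job in response.css('div.job-primary'):
            item = ZhipinspiderItem()
            item['title'] = job.css('div.job-title::text').extract_first()
            item['salary'] = job.css('span.red::text').extract_first()
            item['company'] = job.css('div.company-text h3 a::text').extract_first()
            detail_href = job.css('div.info-primary a::attr(href)').extract_first()
            item['url'] = response.urljoin(detail_href) if detail_href else None
            item['work_addr'] = job.css('div.info-primary p::text').extract_first()
            item['industry'] = job.css('div.company-text p::text').extract_first()
            item['company_size'] = job.css('div.company-text p::text').extract_first()
            item['recruiter'] = job.css('h3.name::text').extract_first()
            item['publish_date'] = job.css('span.time::text').extract_first()
            yield item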
3.2 items.py
import scrapy


class ZhipinspiderItem(scrapy.Item):
    # Job title
    title = scrapy.Field()
    # Salary
    salary = scrapy.Field()
    # Hiring company
    company = scrapy.Field()
    # Link to the job detail page
    url = scrapy.Field()
    # Work location
    work_addr = scrapy.Field()
    # Industry
    industry = scrapy.Field()
    # Company size
    company_size = scrapy.Field()
    # Recruiter
    recruiter = scrapy.Field()
    # Publish date
    publish_date = scrapy.Field()
3.3 middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ZhipinspiderSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class ZhipinspiderDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
3.4 pipelines.py
# Module for accessing MySQL
import mysql.connector


class ZhipinspiderPipeline(object):
    # Constructor: open the MySQL connection and cursor
    def __init__(self):
        self.conn = mysql.connector.connect(user='root',
                                            password='root',
                                            host='192.168.36.58', port='3306',
                                            database='directrecruit',
                                            use_unicode=True)
        self.cur = self.conn.cursor()

    # close_spider callback: release the database resources
    def close_spider(self, spider):
        print('---------- closing database resources -----------')
        # Close the cursor
        self.cur.close()
        # Close the connection
        self.conn.close()

    def process_item(self, item, spider):
        self.cur.execute("INSERT INTO job_inf VALUES(null, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                         (item['title'], item['salary'], item['company'], item['url'],
                          item['work_addr'], item['industry'], item.get('company_size'),
                          item['recruiter'], item['publish_date']))
        self.conn.commit()
        # Return the item so later pipelines can still process it
        return item
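The INSERT in process_item expects a job_inf table whose first column is an auto-increment primary key, followed by nine text columns, one per item field and in the same order. A minimal sketch of a matching table, created with mysql.connector (the column names, types, and lengths are assumptions), could look like this:

import mysql.connector

# Assumed schema matching the ten-value INSERT used by the pipeline;
# adjust names, types and lengths to your real requirements.
DDL = """
CREATE TABLE IF NOT EXISTS job_inf (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    salary VARCHAR(255),
    company VARCHAR(255),
    url VARCHAR(500),
    work_addr VARCHAR(255),
    industry VARCHAR(255),
    company_size VARCHAR(255),
    recruiter VARCHAR(255),
    publish_date VARCHAR(255)
)
"""

conn = mysql.connector.connect(user='root', password='root',
                               host='192.168.36.58', port=3306,
                               database='directrecruit')
cur = conn.cursor()
cur.execute(DDL)
conn.commit()
cur.close()
conn.close()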
3.5 settings.py
BOT_NAME = 'ZhipinSpider'

SPIDER_MODULES = ['ZhipinSpider.spiders']
NEWSPIDER_MODULE = 'ZhipinSpider.spiders'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Default request headers
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36",
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3'
}

# Enable the item pipeline
ITEM_PIPELINES = {
    'ZhipinSpider.pipelines.ZhipinspiderPipeline': 300,
}
This setup respects the site's crawling policy while collecting job postings from BOSS直聘. At this point, however, it can only scrape statically rendered pages.
4. Install the Splash service
Install Docker first; a previous post covers the Docker installation steps in detail. Then start the Splash container:
[root@Docker ~]# docker run -p 8050:8050 scrapinghub/splash
Unable to find image 'scrapinghub/splash:latest' locally
latest: Pulling from scrapinghub/splash
7b722c1070cd: Pull complete
5fbf74db61f1: Pull complete
ed41cb72e5c9: Pull complete
7ea47a67709e: Pull complete
b9ea67282e79: Pull complete
8d0589f2b410: Pull complete
11f417145dc7: Pull complete
14d670a8125e: Pull complete
81d8bf1e3bdc: Pull complete
Digest: sha256:ec1198946284ccadf6749ad60b58b2d2fd5574376857255342a913ec7c66cfc5
Status: Downloaded newer image for scrapinghub/splash:latest
2019-07-19 03:15:03+0000 [-] Log opened.
2019-07-19 03:15:03.976877 [-] Splash version: 3.3.1
2019-07-19 03:15:03.978587 [-] Qt 5.9.1, PyQt 5.9.2, WebKit 602.1, sip 4.19.4, Twisted 18.9.0, Lua 5.2
2019-07-19 03:15:03.978696 [-] Python 3.5.2 (default, Nov 12 2018, 13:43:14) [GCC 5.4.0 20160609]
2019-07-19 03:15:03.978763 [-] Open files limit: 1048576
2019-07-19 03:15:03.978805 [-] Can't bump open files limit
2019-07-19 03:15:04.090084 [-] Xvfb is started: ['Xvfb', ':485890213', '-screen', '0', '1024x768x24', '-nolisten', 'tcp']
QStandardPaths: XDG_RUNTIME_DIR not set, defaulting to '/tmp/runtime-root'
2019-07-19 03:15:04.300570 [-] proxy profiles support is enabled, proxy profiles path: /etc/splash/proxy-profiles
2019-07-19 03:15:04.300725 [-] memory cache: enabled, private mode: enabled, js cross-domain access: disabled
2019-07-19 03:15:04.473283 [-] verbosity=1, slots=20, argument_cache_max_entries=500, max-timeout=90.0
2019-07-19 03:15:04.474206 [-] Web UI: enabled, Lua: enabled (sandbox: enabled)
2019-07-19 03:15:04.475292 [-] Site starting on 8050
2019-07-19 03:15:04.475377 [-] Starting factory
2019-07-19 03:15:04.475596 [-] Server listening on http://0.0.0.0:8050
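With the container listening on port 8050, you can quickly verify that Splash really renders JavaScript by calling its render.html endpoint from Python. The host below matches the SPLASH_URL configured in the next step; adjust it to wherever your container actually runs.

import requests

# render.html returns the page's HTML after Splash has executed its JavaScript;
# 'wait' gives the page a couple of seconds to finish loading dynamic content.
resp = requests.get('http://192.168.244.133:8050/render.html',
                    params={'url': 'https://www.zhipin.com/', 'wait': 2})
print(resp.status_code)
print(resp.text[:300])  # first 300 characters of the rendered HTML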
5. Use the Splash service in Scrapy
5.1 Install scrapy-splash
C:\Users\Administrator>pip install scrapy-splash
Collecting scrapy-splash
Downloading https://files.pythonhosted.org/packages/64/19/aa6e9559ca16a4daec98f6451748dd1cae9a91e7f43069cc1d294f7576bc/scrapy_splash-0.7.2-py2.py3-none-any.whl
Installing collected packages: scrapy-splash
Successfully installed scrapy-splash-0.7.2
5.2 Modify the code
settings.py
# Splash server address
SPLASH_URL = 'http://192.168.244.133:8050'
# Enable the two Splash downloader middlewares and re-order HttpCompressionMiddleware
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
# Splash-aware duplicate filter
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
# Spider middleware that supports cache_args (optional)
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
# Splash-aware HTTP cache storage
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
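With these settings in place, the spider must send its requests through Splash so that the JavaScript is executed before parsing. A minimal sketch of the corresponding change in job_position.py uses scrapy_splash.SplashRequest; the start URL and wait time are assumptions, and the field extraction stays the same as in the static version.

import scrapy
from scrapy_splash import SplashRequest
from ZhipinSpider.items import ZhipinspiderItem

class JobPositionSpider(scrapy.Spider):
    name = 'job_position'
    allowed_domains = ['zhipin.com']

    def start_requests(self):
        # Route the request through Splash; 'wait' gives the page time
        # to finish executing its JavaScript before the HTML is returned.
        yield SplashRequest('https://www.zhipin.com/c101280100/',
                            callback=self.parse,
                            args={'wait': 2})

    def parse(self, response):
        # response now contains the JS-rendered HTML, so the same
        # selectors used for static pages can be applied here.
        for job in response.css('div.job-primary'):
            item = ZhipinspiderItem()
            item['title'] = job.css('div.job-title::text').extract_first()
            # ... populate the remaining fields exactly as in section 3.1 ...
            yield item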