Scrapy is an application framework written in pure Python for crawling websites and extracting structured data, and it is used for a very wide range of purposes.
Thanks to the power of the framework, a user only needs to write a few custom modules to build a crawler that scrapes page content and images with very little effort.
Scrapy uses the Twisted asynchronous networking framework (its main alternative is Tornado) to handle network communication, which speeds up downloads and saves you from implementing asynchronous code yourself; it also exposes a variety of middleware hooks, so it can be adapted flexibly to different needs.
Scrapy's main components are:
- Scrapy Engine: coordinates the data flow between all of the other components.
- Scheduler: queues the requests it receives from the engine and hands them back when asked.
- Downloader: fetches web pages and returns the responses to the engine.
- Spiders: user-written classes that parse responses and produce items and follow-up requests.
- Item Pipeline: post-processes the items a spider extracts (cleaning, validation, storage).
- Downloader middlewares / spider middlewares: hooks between the engine and the downloader or spiders, used to customize requests and responses.
Scrapy's workflow is roughly as follows: the engine takes the initial requests from the spider and hands them to the scheduler; the scheduler returns the next request, which the engine sends through the downloader middlewares to the downloader; the downloader fetches the page and returns a response; the engine passes the response through the spider middlewares to the spider; the spider parses it and yields items (sent to the item pipeline) and new requests (sent back to the scheduler); the cycle repeats until no requests remain.
Create the project:
scrapy startproject recruit
Create the spider:
cd recruit
scrapy genspider Position hr.tencent.com
The project directory and files are created automatically.
File descriptions (the layout generated by scrapy startproject):
scrapy.cfg              - project deployment configuration
recruit/items.py        - item definitions (the data model for the scraped fields)
recruit/pipelines.py    - item pipelines (post-processing and storage)
recruit/settings.py     - project settings
recruit/middlewares.py  - spider and downloader middlewares
recruit/spiders/        - directory for spider code; genspider creates Position.py here
Note: a spider file is usually named after the domain of the site being crawled.
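For reference, scrapy genspider fills in a skeleton roughly like the one below in recruit/spiders/Position.py (the exact template varies slightly between Scrapy versions); the parse() method is what we will flesh out next:

# -*- coding: utf-8 -*-
# Skeleton roughly as generated by "scrapy genspider Position hr.tencent.com"
import scrapy

class PositionSpider(scrapy.Spider):
    name = 'Position'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['http://hr.tencent.com/']

    def parse(self, response):
        pass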
Set up the data storage template (items.py)
import scrapy

class PositionItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # job title
    position_name = scrapy.Field()
    # link to the job posting
    position_link = scrapy.Field()
    # job category
    position_type = scrapy.Field()
    # number of openings
    people_num = scrapy.Field()
    # work location
    work_address = scrapy.Field()
    # publication date
    publish_time = scrapy.Field()
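An Item declared this way behaves much like a Python dict, except that only declared fields may be assigned. A quick sketch (not part of the project files) of how it is used:

from recruit.items import PositionItem

item = PositionItem()
item["position_name"] = "example title"   # allowed: the field is declared above
# item["salary"] = "..."                  # would raise KeyError: field not declared
print(dict(item))                         # Items convert cleanly to plain dicts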
Write the spider
# -*- coding: utf-8 -*-
import scrapy
from recruit.items import PositionItem

class PositionSpider(scrapy.Spider):
    name = 'Position'
    allowed_domains = ['hr.tencent.com']
    start_urls = ["https://hr.tencent.com/position.php?&start=0"]

    def parse(self, response):
        # each job row in the listing table has class "even" or "odd"
        position_lists = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for position in position_lists:
            item = PositionItem()
            position_name = position.xpath("./td[1]/a/text()").extract()
            position_link = position.xpath("./td[1]/a/@href").extract()
            position_type = position.xpath("./td[2]/text()").extract()
            people_num = position.xpath("./td[3]/text()").extract()
            work_address = position.xpath("./td[4]/text()").extract()
            publish_time = position.xpath("./td[5]/text()").extract()
            item["position_name"] = position_name
            item["position_link"] = position_link
            item["position_type"] = position_type
            item["people_num"] = people_num
            item["work_address"] = work_address
            item["publish_time"] = publish_time
            yield item
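Note that .extract() returns a list of strings, so each field above is stored as a list. If plain strings are preferred, the selector methods .get() / .getall() (or .extract_first()) can be used instead; a minimal sketch of how parse() would look with single-value extraction:

# Sketch: same parse() logic, but storing plain strings instead of lists.
def parse(self, response):
    for position in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
        item = PositionItem()
        item["position_name"] = position.xpath("./td[1]/a/text()").get()
        item["position_link"] = position.xpath("./td[1]/a/@href").get()
        item["position_type"] = position.xpath("./td[2]/text()").get()
        item["people_num"] = position.xpath("./td[3]/text()").get()
        item["work_address"] = position.xpath("./td[4]/text()").get()
        item["publish_time"] = position.xpath("./td[5]/text()").get()
        yield item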
Configure the settings
Add the following to settings.py:
ITEM_PIPELINES = {
    'recruit.pipelines.RecruitPipeline': 300,
}
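The integer is a priority between 0 and 1000; when several pipelines are enabled, items pass through them in ascending order. A sketch with a second, purely hypothetical pipeline (DedupPipeline is not part of this project) just to show the ordering:

ITEM_PIPELINES = {
    'recruit.pipelines.DedupPipeline': 100,    # hypothetical: would run first
    'recruit.pipelines.RecruitPipeline': 300,  # the JSON writer defined below
}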
Write the data-processing script
pipelines.py
import json

class RecruitPipeline(object):
    def open_spider(self, spider):
        # called once when the spider starts: open the output file
        self.file = open("position.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        # write each item as one JSON line
        dict_item = dict(item)
        json_str = json.dumps(dict_item, ensure_ascii=False) + "\n"
        self.file.write(json_str)
        return item

    def close_spider(self, spider):
        # called once when the spider finishes: close the file
        self.file.close()
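process_item() must either return the item, so that any later pipelines receive it, or raise DropItem to discard it. A short, hypothetical sketch (not used in this project) of a pipeline that drops records without a position name:

from scrapy.exceptions import DropItem

class RequiredFieldsPipeline(object):
    # Hypothetical example: discard items that have no position name.
    def process_item(self, item, spider):
        if not item.get("position_name"):
            raise DropItem("missing position_name")
        return item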
Create a file named start.py in the same directory as scrapy.cfg.
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl Position -o position.xml".split())
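start.py simply invokes the Scrapy command line from Python, which is convenient when running from an IDE; the equivalent shell command is:

scrapy crawl Position -o position.xml

Note that -o uses Scrapy's built-in feed exporter (here an XML feed, inferred from the file extension) and works independently of the custom pipeline, which writes position.json.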
Result of the run:
The above scrapes only the first page. Scraping all of the pages works the same way; you just need to work out how the URL changes from one page to the next. Below is the code that scrapes all of the job postings.
You can see that there are 3,720 positions in total.
Comparing the URLs shows that adding 10 to the start parameter moves to the next page (see the short sketch after the list):
第一页:https://hr.tencent.com/position.php?&start=0
第二页:https://hr.tencent.com/position.php?&start=10
第三页:https://hr.tencent.com/position.php?&start=20
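A tiny sketch of the pattern, just to make the offset arithmetic explicit:

# The start parameter grows by 10 per page: 0, 10, 20, ...
base = "https://hr.tencent.com/position.php?&start="
first_pages = [base + str(i * 10) for i in range(3)]
print(first_pages)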
We define an offset variable in Position.py.
Here is the code in Position.py:
# -*- coding: utf-8 -*-
import scrapy
from recruit.items import PositionItem

class PositionSpider(scrapy.Spider):
    name = 'Position'
    allowed_domains = ['hr.tencent.com']
    # page offset: the start parameter increases by 10 per page
    offset = 0
    url = "https://hr.tencent.com/position.php?&start="
    start_urls = [url + str(offset) + "#a", ]

    def parse(self, response):
        print("response.url==", response.url)
        position_lists = response.xpath('//tr[@class="even"] | //tr[@class="odd"]')
        for position in position_lists:
            item = PositionItem()
            position_name = position.xpath('./td[1]/a/text()').extract()[0]
            position_link = position.xpath('./td[1]/a/@href').extract()[0]
            position_type = position.xpath('./td[2]/text()').get()
            people_num = position.xpath('./td[3]/text()').extract()[0]
            work_address = position.xpath('./td[4]/text()').extract()[0]
            publish_time = position.xpath('./td[5]/text()').extract()[0]
            item["position_name"] = position_name
            item["position_link"] = position_link
            item["position_type"] = position_type
            item["people_num"] = people_num
            item["work_address"] = work_address
            item["publish_time"] = publish_time
            yield item
        # request the next page
        total_page = response.xpath('//div[@class="left"]/span/text()').extract()[0]
        print("total_page===", total_page)
        if self.offset < int(total_page):
            # each page differs by 10 in the start parameter
            self.offset += 10
            # URL of the next page
            new_url = self.url + str(self.offset) + "#a"
            # hand the new request back to the Scrapy engine
            yield scrapy.Request(new_url, callback=self.parse)
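As an aside, instead of computing offsets by hand, the next request could also be built from the page's own pagination link at the end of parse() using response.follow; the XPath below is an assumption about the page markup, not something taken from the original code:

# Sketch: follow the page's "next" link (the selector is assumed, not verified).
next_href = response.xpath('//a[@id="next"]/@href').get()
if next_href:
    yield response.follow(next_href, callback=self.parse)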
pipelines.py
import json

class RecruitPipeline(object):
    def open_spider(self, spider):
        # the output is JSON lines, so use a .json extension and UTF-8 encoding
        self.file_name = open(spider.name + "_position.json", "w", encoding="utf-8")

    def process_item(self, item, spider):
        python_dict = dict(item)
        json_text = json.dumps(python_dict, ensure_ascii=False) + "\n"
        self.file_name.write(json_text)
        return item

    def close_spider(self, spider):
        self.file_name.close()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for recruit project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'recruit'
SPIDER_MODULES = ['recruit.spiders']
NEWSPIDER_MODULE = 'recruit.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'recruit (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/6.0)',
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'recruit.middlewares.RecruitSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'recruit.middlewares.RecruitDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'recruit.pipelines.RecruitPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Result of the run:
A total of 3,720 records.
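A quick sketch to confirm the record count from the pipeline output (the file name follows the open_spider() code above; the count should match the 3,720 positions listed on the site):

# Count the JSON lines written by RecruitPipeline.
with open("Position_position.json", encoding="utf-8") as f:
    print(sum(1 for _ in f))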