Implementing a distributed crawler with Redis
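All of the code below comes from a scrapy-redis based Scrapy project named lagou, which crawls machine-learning job postings for Beijing from Lagou's positionAjax.json API. It is shown in four parts: the project settings, the item definitions, a pipeline that appends each item to a CSV file, and the spider itself. Because the scheduler, the duplicate filter and the Redis item pipeline all point at one shared Redis server, any number of crawler processes started with these settings cooperate on the same request queue.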
# Scrapy settings for lagou project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/topics/settings.html
#
SPIDER_MODULES = ['lagou.spiders']
NEWSPIDER_MODULE = 'lagou.spiders'
#USER_AGENT = 'scrapy-redis (+https://github.com/rolando/scrapy-redis)'
# Redis-based request dedup filter, shared by every crawler node
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Scheduler that keeps the request queue in Redis and hands requests out to all nodes
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Keep the Redis request queue and dupefilter instead of clearing them when the spider closes
SCHEDULER_PERSIST = True
# Use one of the queue implementations provided by scrapy-redis
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"  # priority queue (most common)
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"  # FIFO queue, first in first out
#SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"  # LIFO queue, last in first out
ITEM_PIPELINES = {
    'lagou.pipelines.lagouPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Log level
# LOG_LEVEL = 'DEBUG'
# Introduce an artificial delay between requests; with several nodes crawling in
# parallel, overall throughput stays reasonable even with a long per-node delay.
# Delay between requests (seconds)
DOWNLOAD_DELAY = 30
# Default request headers
DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://www.lagou.com/jobs/list_%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput=',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
}
# Disable cookies
COOKIES_ENABLED = False
# Do not obey robots.txt
ROBOTSTXT_OBEY = False
# Retry settings
RETRY_ENABLED = True
RETRY_TIMES = 5  # number of retries per request
DOWNLOAD_TIMEOUT = 5  # download timeout, in seconds
# Remote Redis server shared by all crawler nodes (a Redis cluster can also be used)
REDIS_HOST = '10.25.34.65'
REDIS_PORT = 6379
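With the scheduler and dupefilter configured above, the crawl state lives in Redis rather than in each process. As a minimal sketch (assuming the default scrapy-redis key names for a spider called dmoz), the shared queue and fingerprint set can be inspected with redis-py:

# Minimal inspection sketch; 'dmoz:requests' and 'dmoz:dupefilter' are the
# default scrapy-redis key names for a spider named 'dmoz' (an assumption here).
import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)
# Pending requests shared by every crawler node (a sorted set for the priority queue)
print('queued requests  :', r.zcard('dmoz:requests'))
# Fingerprints of requests that have already been scheduled (the shared dupefilter)
print('seen fingerprints:', r.scard('dmoz:dupefilter'))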
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst, Join
class ExampleItem(Item):
    # Default fields from the scrapy-redis example project
    name = Field()
    description = Field()
    link = Field()
    crawled = Field()
    spider = Field()
    url = Field()
    # Custom fields for Lagou job postings
    positionName = Field()
    companyFullName = Field()
    companyShortName = Field()
    companySize = Field()
    financeStage = Field()
    district = Field()
    education = Field()
    workYear = Field()
    salary = Field()
    positionAdvantage = Field()

class ExampleLoader(ItemLoader):
    default_item_class = ExampleItem
    default_input_processor = MapCompose(lambda s: s.strip())
    default_output_processor = TakeFirst()
    description_out = Join()
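ExampleLoader is declared but never used by the spider below, which fills ExampleItem directly from the JSON API. For reference, a hypothetical sketch of how the loader could be applied to an HTML response; the CSS selectors are placeholders, not taken from the project:

# Hypothetical ItemLoader usage (would live inside a spider callback);
# the CSS selectors are placeholders, not taken from the project.
from lagou.items import ExampleLoader

def parse_detail(response):
    loader = ExampleLoader(response=response)
    loader.add_value('url', response.url)
    loader.add_css('positionName', 'h1.job-name::text')   # placeholder selector
    loader.add_css('salary', 'span.salary::text')          # placeholder selector
    return loader.load_item()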
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html
from datetime import datetime
import os
import pandas

class lagouPipeline(object):
    def process_item(self, item, spider):
        # Default fields expected by the scrapy-redis example item
        item["crawled"] = datetime.utcnow()
        item["spider"] = spider.name
        # Custom fields scraped from the Lagou JSON API
        positionName = item['positionName']
        companyFullName = item['companyFullName']
        companyShortName = item['companyShortName']
        companySize = item['companySize']
        financeStage = item['financeStage']
        district = item['district']
        education = item['education']
        workYear = item['workYear']
        salary = item['salary']
        positionAdvantage = item['positionAdvantage']
        data = [companyFullName, companyShortName, companySize, financeStage, district,
                positionName, workYear, education, salary, positionAdvantage]
        columns = ['公司全名', '公司简称', '公司规模', '融资阶段', '区域', '职位名称', '工作经验', '学历要求', '工资', '职位福利']
        # Append one row per item; write the header only when the file is first created
        csv_path = '北京-机器学习.csv'
        df = pandas.DataFrame(data=[data], columns=columns)
        df.to_csv(csv_path, mode='a', header=not os.path.exists(csv_path), index=False)
        return item
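scrapy_redis.pipelines.RedisPipeline (enabled in ITEM_PIPELINES above) additionally pushes every serialized item onto a Redis list, by default named <spider>:items. A rough consumer sketch, assuming the default key name dmoz:items and the default JSON serialization:

# Hypothetical consumer for items stored by RedisPipeline
# (assumes the default 'dmoz:items' key and JSON-serialized items)
import json
import redis

r = redis.StrictRedis(host='10.25.34.65', port=6379)
while True:
    raw = r.lpop('dmoz:items')
    if raw is None:
        break   # list drained
    job = json.loads(raw)
    print(job.get('positionName'), job.get('salary'))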
import json
import math
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from lagou.items import ExampleItem
class DmozSpider(CrawlSpider):
    name = 'dmoz'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false']
    # rules = [
    #     Rule(LinkExtractor(
    #         allow=(r'a regex matching any link within www.lagou.com')
    #     ), callback='start_requests', follow=True),
    # ]
    def start_requests(self):
        print('start_requests--------------------------------------------------------')
        url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=北京&needAddtionalResult=false'
        # Bootstrap request used only to discover the total page count;
        # dont_filter=True keeps its fingerprint out of the shared dupefilter,
        # so the identical pn=1 request yielded again in get_pagenum is not dropped
        yield scrapy.FormRequest(
            url=url,
            formdata={
                'first': 'true',
                'pn': '1',
                'kd': '机器学习'
            },
            callback=self.get_pagenum,
            dont_filter=True,
        )
    def get_pagenum(self, response):
        # Work out the total number of result pages (15 postings per page, capped at 30 pages)
        meta = json.loads(response.body)
        print(meta)
        jobnum = meta['content']['positionResult']['totalCount']
        pagedemo = math.ceil(jobnum / 15)
        if pagedemo > 30:
            pagenum = 30
        else:
            pagenum = pagedemo
        print(f'Total pages: {pagenum}')
        url = response.url
        # Request every result page; the Redis scheduler distributes these
        # requests across all crawler nodes
        for num in range(1, pagenum + 1):
            yield scrapy.FormRequest(
                url=url,
                formdata={
                    'first': 'true',
                    'pn': str(num),
                    'kd': '机器学习'
                },
                callback=self.get_message,
            )
    def get_message(self, response):
        # Parse the JSON result list and emit one item per job posting
        meta = json.loads(response.body)
        print(f'meta:{meta}')
        joblist = meta['content']['positionResult']['result']
        for job in joblist:
            item = ExampleItem()
            item['positionName'] = job['positionName']
            item['companyFullName'] = job['companyFullName']
            item['companyShortName'] = job['companyShortName']
            item['companySize'] = job['companySize']
            item['financeStage'] = job['financeStage']
            item['district'] = job['district']
            item['education'] = job['education']
            item['workYear'] = job['workYear']
            item['salary'] = job['salary']
            item['positionAdvantage'] = job['positionAdvantage']
            # Hand the item to the pipelines (CSV file + Redis item list)
            yield item
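To actually distribute the crawl, start the same project on every worker machine that can reach the Redis server at 10.25.34.65, for example with scrapy crawl dmoz. A rough equivalent as a Python launcher script, loading the settings shown above:

# Sketch of a per-node launcher (equivalent to running `scrapy crawl dmoz`
# inside the project directory on each worker machine)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('dmoz')
process.start()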