scrapy-redis distributed crawler: a hands-on Zhilian Zhaopin (智联招聘) project


Platform: Windows + Linux (Ubuntu)
Python version: Python 3.5
IDE: PyCharm
Other tools: Chrome browser, MySQL, Redis
GitHub project: https://github.com/HAOyanWEI24/scrapy-redis_zhilian


一. First, create the project
  • scrapy startproject zhaopin
  • cd zhaopin
  • scrapy genspider zhaopin zhaopin.com
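Before writing any code, make sure the dependencies are installed. A minimal sketch, assuming pip and pymysql as the MySQL driver (the post itself does not pin versions):

pip install scrapy scrapy-redis redis pymysql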
二. Define the fields to scrape in items.py
import scrapy


class ZhaopingItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    job_name = scrapy.Field()   # job title
    job_link = scrapy.Field()   # job link
    job_info = scrapy.Field()   # job description
    company = scrapy.Field()    # company name
    address = scrapy.Field()    # address
    salary = scrapy.Field()     # salary
    # company_info = scrapy.Field()  # company profile

三. Write the crawl logic in spiders/zhaopin.py
  • from scrapy_redis.spiders import RedisCrawlSpider -- import the spider class from scrapy_redis
  • class ZhaopinSpider(scrapy.Spider): comment this line out
  • class ZhaopinSpider(RedisCrawlSpider):  # inherit from the class provided by scrapy-redis
# -*- coding: utf-8 -*-
import scrapy
from scrapy import Request
from ..items import ZhaopingItem
import redis
from scrapy_redis.spiders import RedisCrawlSpider


# class ZhaopinSpider(scrapy.Spider):
class ZhaopinSpider(RedisCrawlSpider):  # inherit from the spider class provided by scrapy-redis
    name = 'zhaopin'
    redis_key = 'ZhaopinSpider:start_urls'

    allowed_domains = ['zhaopin.com']  # the job site lives under zhaopin.com, not zhilian.com
    start_urls = ['http://zhaopin.com/']

    def start_requests(self):
        """Zhilian Zhaopin (智联招聘) job search entry point"""
        url = "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python"
        yield Request(url, callback=self.parseMainPage)

    def parseMainPage(self, response):
        item = ZhaopingItem()
        # extract the job detail URLs from the listing page

        urls = response.xpath('//td[@class="zwmc"]/div/a')
        # print(urls)
        for url in urls:
            url = url.xpath('@href').extract()[0]
            # print(url)
            # request each job detail page via the callback
            yield Request(url, meta={'item': item}, callback=self.parseDetails, dont_filter=True)
        """
        下一页网页通过拼串形式暂时爬取100页的招聘信息
        """
        for i in range(100):
            url = 'https://sou.zhaopin.com/?jl=北京&jt=&kw=python&kt='+str(i)
            print(url)
            print(">>>>>>>>>>>>>>>")
            yield scrapy.Request(url, callback=self.parseMainPage)

    """
    公司职位详细信息提取
    """
    def parseDetails(self,response):
        item = response.meta['item']
        job_name = response.xpath('//div[@class="fixed-inner-box"]/div[1]/h1/text()').extract()  # job title
        job_info = response.xpath('//div[@class="tab-inner-cont"]/p[2]/text()').extract()  # job description
        salary = response.xpath('//ul[@class="terminal-ul clearfix"]/li[1]/strong/text()').extract()  # salary
        address = response.xpath('//div[@class="tab-inner-cont"]/h2/text()').extract()  # company address
        company = response.xpath('//p[@class="company-name-t"]/a/text()').extract()  # company name
        job_link = response.xpath('//p[@class="company-name-t"]/a/@href').extract()  # company URL
        # company_info = response.xpath('//p[@class="company-name-t"]/a/@href').extract()  # company profile
        """
        由于有些解析网页标签不存在,通过抛出异常的方式,将此标签解析为空,最后将他返回出去
        """
        try:
            item['company'] = company[0]
            item['job_info'] = job_info[0]
            item['job_name'] = job_name[0]
            item['address'] = address[0]
            item['salary'] = salary[0]
            item['job_link'] = job_link[0]
        except Exception as e:
            item['company'] = "空"
            item['job_info'] = "空"
            item['job_name'] = "空"
            item['salary'] = "空"
            item['address'] = "空"
            item['job_link'] = "空"
        yield item
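The try/except above blanks every field as soon as a single XPath misses. If you prefer per-field fallbacks, Scrapy's selectors provide extract_first() with a default value; below is a minimal sketch of the same extraction written that way (same XPaths as above, not the code from the original repo):

    def parseDetails(self, response):
        item = response.meta['item']
        # extract_first() returns the first match, or the given default when the tag is missing,
        # so one absent tag only blanks that one field
        item['job_name'] = response.xpath('//div[@class="fixed-inner-box"]/div[1]/h1/text()').extract_first("空")
        item['job_info'] = response.xpath('//div[@class="tab-inner-cont"]/p[2]/text()').extract_first("空")
        item['salary'] = response.xpath('//ul[@class="terminal-ul clearfix"]/li[1]/strong/text()').extract_first("空")
        item['address'] = response.xpath('//div[@class="tab-inner-cont"]/h2/text()').extract_first("空")
        item['company'] = response.xpath('//p[@class="company-name-t"]/a/text()').extract_first("空")
        item['job_link'] = response.xpath('//p[@class="company-name-t"]/a/@href').extract_first("空")
        yield item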

四. The key settings in settings.py
"""
scrapy-redis设置方法
"""

#按照sorted 排序顺序出队列,建议使用某一个,这样才能在redis数据库中看到,其实可以不写不影响结果
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"

#使用scrapy-redis自己调度器,不使用scrapy默认的调度器,负责去重
SCHEDULER ="scrapy_redis.scheduler.Scheduler"

#使用scrapy-redis自己的组件去重,不使用scrapy默认的去重
DUPEFILTER_CLASS ="scrapy_redis.dupefilter.RFPDupeFilter"

#调度状态持久化,不清理redis缓存,允许暂停/启动爬虫
SCHEDULER_PERSIST =True

#redis
REDIS_HOST ='127.0.0.1'#此处需填写你的Master端的IP端口号
REDIS_PORT =6379
#模拟浏览器的请求头
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) ' \
             'AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'

HTTPERROR_ALLOWED_CODES = [403]

ITEM_PIPELINES = {
    'zhaoping.pipelines.ZhaopingPipeline': 300,
    'zhaoping.pipelines.ExamplePipeline': 300,
    # This pipeline must be enabled -- it is what stores the scraped items in Redis
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
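ZhaopingPipeline and ExamplePipeline are referenced above but never shown in this post (they live in the project's pipelines.py; see the GitHub repo for the real versions). A purely illustrative sketch of what that file could contain, so the ITEM_PIPELINES entries resolve:

# pipelines.py (sketch, not the repo's actual code)
import datetime


class ZhaopingPipeline(object):
    def process_item(self, item, spider):
        # project-specific cleanup/validation before the item reaches the RedisPipeline
        return item


class ExamplePipeline(object):
    def process_item(self, item, spider):
        # e.g. log which spider scraped the item and when
        spider.logger.debug("item scraped by %s at %s", spider.name,
                            datetime.datetime.utcnow().isoformat())
        return item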

五. Run the spider from PyCharm, on Windows and on Linux (Ubuntu)
  • scrapy crawl zhaopin    (the spider's name is 'zhaopin'; seeding the start URL into Redis is sketched below)
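Because the spider is a RedisCrawlSpider with redis_key = 'ZhaopinSpider:start_urls', workers are normally fed by pushing a start URL into that Redis list (the overridden start_requests above bypasses this, but seeding is how scrapy-redis is usually driven from the Master). A sketch, assuming redis-cli against the Master node configured in section four:

redis-cli lpush ZhaopinSpider:start_urls "https://sou.zhaopin.com/jobs/searchresult.ashx?jl=北京&kw=python"
redis-cli llen zhaopin:items    # RedisPipeline stores serialized items under <spider name>:items by default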
Running on Windows
Running on Linux
Data shown in Redis Desktop Manager
MySQL table creation statements
create database job_items charset='utf8';  # create the database
# create the table
create table sina_items(
    id bigint not null auto_increment primary key,
    job_name varchar(255) not null,
    job_link varchar(255) not null,
    job_info varchar(255) not null,
    company varchar(255) not null,
    address varchar(255) not null,
    salary varchar(100) not null
) charset=utf8;
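The RedisPipeline only pushes serialized items into Redis; loading them into the MySQL table is a separate step. A minimal consumer sketch that pops items from the default zhaopin:items list and inserts them into the table above, assuming pymysql, local Redis/MySQL, and placeholder credentials (adjust to your setup; this script is not part of the original repo):

# store_items.py (sketch) -- run alongside the crawl
import json
import redis
import pymysql

r = redis.StrictRedis(host='127.0.0.1', port=6379)
db = pymysql.connect(host='127.0.0.1', user='root', password='root',
                     db='job_items', charset='utf8')
cursor = db.cursor()

while True:
    # blocking pop: waits until the RedisPipeline pushes the next item
    _, data = r.blpop('zhaopin:items')
    item = json.loads(data.decode('utf-8'))
    cursor.execute(
        "insert into sina_items(job_name, job_link, job_info, company, address, salary) "
        "values (%s, %s, %s, %s, %s, %s)",
        (item['job_name'], item['job_link'], item['job_info'],
         item['company'], item['address'], item['salary']))
    db.commit()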
