Boss直聘scrapy爬虫

爬取 Boss 直聘热门岗位信息，源码如下：

# -*- coding: utf-8 -*-

import scrapy
from Boss.items import BossItem
from scrapy import Request
from copy import deepcopy

class BossSpider(scrapy.Spider):
    """Spider that walks the category links on the zhipin.com landing page.

    ``parse`` reads the category links from the start page and schedules one
    request per category; ``parse_content`` then yields one ``BossItem`` per
    ``<li>`` row found on the category page.
    """

    name = 'boss'
    allowed_domains = ['zhipin.com']
    # Start URL.
    start_urls = ['https://www.zhipin.com/?sid=sem_pz_bdpc_dasou_title']

    def parse(self, response):
        """Yield one request per top-level category link on the start page.

        Each request carries a fresh ``BossItem`` in ``meta['item']`` with the
        category's ``Name`` and absolute ``URL`` already filled in.
        """
        # Iterate over the top-level categories.
        links = response.xpath('//*[@id="main"]/div/div[2]/div[1]/div[2]/a')
        for link in links:
            href = link.xpath('./@href').extract_first()
            if not href:
                # No target to follow; concatenating None would raise TypeError.
                continue

            # One item per category so requests never share mutable state.
            item = BossItem()
            # Category name and URL.
            item['Name'] = link.xpath('./text()').extract_first()
            item['URL'] = response.urljoin(href)
            yield Request(
                item['URL'],
                callback=self.parse_content,
                meta={'item': item},
            )

    def parse_content(self, response):
        """Yield one populated item per listing row on a category page.

        The category fields set by ``parse`` are read from
        ``response.meta['item']`` and copied into every yielded item.
        Fields whose XPath matches nothing are set to ``None``.
        """
        category = response.meta['item']
        rows = response.xpath('//*[@id="main"]/div/div[2]/ul/li')
        for row in rows:
            # Copy per row: yielding the same mutated object would let later
            # rows overwrite earlier ones before pipelines consume them.
            item = deepcopy(category)

            # Listing details.
            item['job'] = row.xpath('./div/div[1]/h3/a/div[1]/text()').extract_first()
            item['price'] = row.xpath('./div/div[1]/h3/a/span/text()').extract_first()
            item['addrr'] = row.xpath('./div/div[1]/p/text()[1]').extract_first()
            item['yaoqiu'] = row.xpath('./div/div[1]/p/text()[3]').extract_first()
            item['name'] = row.xpath('./div/div[2]/div/h3/a/text()').extract_first()

            href = row.xpath('./div/div[1]/h3/a/@href').extract_first()
            # urljoin avoids the doubled slash produced by plain concatenation
            # and the guard avoids a TypeError when the link is missing.
            item['url'] = response.urljoin(href) if href else None
            yield item

你可能感兴趣的:(Boss直聘scrapy爬虫)