# R | 爬取猎聘网职位薪酬

library(rvest)
library(magrittr)
library(dplyr)
library(tidyr)


### 单页抓取数据
get_job_on_page <- function(url){
     
##1、通过CSS 选择器,抓取网页的数据
  #url <- c("https://www.liepin.com/zhaopin/?init=-1&headckid=c88035ff1557e3f8&fromSearchBtn=2&ckid=c88035ff1557e3f8°radeFlag=0&sfrom=click-pc_homepage-centre_searchbox-search_new&key=CRA&siTag=xKLlO2y_xees_Q4GxkmxTA%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_fp&d_ckId=de8e60921cc286bb43b240c184600370&d_curPage=1&d_pageSize=40&d_headId=de8e60921cc286bb43b240c184600370&curPage=0")
  jobs_webpage <- read_html(url)
  joblists <- jobs_webpage %>% html_nodes(css = '.condition') %>% html_attr('title') %>% strsplit('_') %>% as.data.frame() %>% t() %>% as.data.frame()
  
##2、数据过滤
  rownames(joblists) <- NULL
  names(joblists) <- c("salary", "city", "education", "experience")
  #t1 <- joblists[joblists$salary != "面议",]
  #t2 <- subset(joblists, salary != "面议")
  
  joblists %<>% filter(salary != "面议") %>% filter(city   != "不限") %>% filter(!(grepl("省", city)))
  
  joblists$salary <- sub("万", "", joblists$salary) %>% strsplit("-") %>% lapply(function(x){
     mean(as.numeric(x))}) %>% unlist()
  
  joblists %<>% apply(1,function(x){
     
    if(grepl("-", x[2])){
     
      x[2] <- strsplit(x[2],"-")[[1]][1]
      }
    x
    }) %>% t() %>% as.data.frame()
  
###3、拆分行
  joblists <- separate_rows(joblists, city,  convert = TRUE)
  return(joblists)  
}

get_job_on_all_page <- function(keyword, maxpage){
     
  base_url <- paste0('https://www.liepin.com/zhaopin/?init=-1&headckid=c88035ff1557e3f8&fromSearchBtn=2&ckid=c88035ff1557e3f8°radeFlag=0&sfrom=click-pc_homepage-centre_searchbox-search_new&key=',keyword,'&siTag=xKLlO2y_xees_Q4GxkmxTA%7EfA9rXquZc5IkJpXC-Ycixw&d_sfrom=search_fp&d_ckId=de8e60921cc286bb43b240c184600370&d_curPage=1&d_pageSize=40&d_headId=de8e60921cc286bb43b240c184600370&curPage=')
  jobs <- data.frame()
  for (i in 0:(maxpage-1)) {
     
    print(i)
    total_url <- paste0(base_url,i)
    jobs <- rbind(jobs, get_job_on_page(total_url))
  }
  return(jobs)
}

CRA_jobs <- get_job_on_all_page("CRA", 99)
write.table(CRA_jobs , 'CRA_jobs.txt', sep='\t',quote=F,row.names=F,col.names=T)

# 模仿自:https://ask.hellobi.com/blog/R_shequ/11523

# 你可能感兴趣的:(#,R,爬虫,R,爬虫)