R语言爬虫 电影票房-艺恩网

一、爬取目的:
爬取数据用于论文-大数据背景下我国电影票房预测研究
数据来源:http://www.cbooo.cn/

二、思路解析:

  1. 分页请求电影列表接口(共395页),获取 电影名称+ID
  2. 拼接 http://www.cbooo.cn/m/ + ID 得到电影详情页地址,再逐页抓取详情字段
library(tidyverse)
library(httr)
library(jsonlite)
library(rlist)
library(plyr)
## tidyverse: a bundle of eight commonly used R packages
## httr: HTTP client, comparable to Python's requests library

# Fetch the overview listing: one JSON page per request, 395 pages in total.
# Result: `yien_data`, a data frame with one row per movie returned by the
# listing endpoint (columns are whatever the first JSON element contains).
overview_base_url <- 'http://www.cbooo.cn/Mdata/getMdata_movie?area=50&type=0&year=0&initial=%E5%85%A8%E9%83%A8&pIndex='
overview_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
n_pages <- 395L

# Collect pages in a preallocated list and bind once at the end; growing a
# data frame with rbind inside the loop re-copies it on every iteration.
pages <- vector("list", n_pages)
for (i in seq_len(n_pages)) {
  # paste0() instead of paste(..., seq = ''): the misspelled `sep` left the
  # default " " separator in place and put spaces around the page number.
  url <- paste0(overview_base_url, i)

  # Throttle requests so the site is not hammered.
  Sys.sleep(0.5)

  # user_agent() builds a request config object. httr only treats *unnamed*
  # arguments as config, so `user_agent = "..."` did not set the header.
  response <- GET(url, user_agent(overview_ua))
  if (http_error(response)) {
    # Skip a failed page instead of aborting the whole crawl; the slot stays
    # NULL and is dropped by rbind.fill() below.
    warning("Skipping overview page ", i, ": HTTP ", status_code(response),
            call. = FALSE)
    next
  }

  result <- fromJSON(content(response, as = "text"))
  # The movie records are taken from the first element of the parsed JSON.
  pages[[i]] <- as.data.frame(result[[1]])
  print(paste("已完成", i))
}

# rbind.fill() accepts a list of data frames, fills columns missing from some
# pages with NA, and silently drops NULL entries.
yien_data <- rbind.fill(pages)
if (is.null(yien_data)) {
  # Every page failed: keep the original "empty data frame" starting state.
  yien_data <- data.frame()
}
# Fetch every movie's detail page and extract its labelled fields.
# Result: `details_data` (one row per row of `yien_data`, columns V1..V9) and
# `yien_newfile`, the overview and detail columns bound side by side.
details_ua <- "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
details_selector <- '#top > div:nth-child(3) > div.mainbox.fr > div > div.ziliaoku > div.ziliaofr > div.cont > p'

# Label text (left of the colon) -> destination column in details_data.
# Columns 1-2 hold the first two paragraphs; column 9 catches any other label.
field_columns <- c(
  "类型" = 3L,
  "片长" = 4L,
  "上映时间" = 5L,
  "制式" = 6L,
  "国家及地区" = 7L,
  "发行公司" = 8L
)

# Preallocate a fixed NA-filled grid. Assigning cell by cell into an empty
# data.frame fails ("new columns would leave holes") when a page lacks an
# early label, and leaves too few rows for cbind() when trailing pages yield
# nothing.
n_movies <- nrow(yien_data)
details_grid <- matrix(
  NA_character_,
  nrow = n_movies,
  ncol = 9L,
  dimnames = list(NULL, paste0("V", 1:9))
)

for (i in seq_len(n_movies)) {
  # NOTE(review): assumes the 2nd overview column is the movie ID -- confirm
  # against the JSON schema.
  movieID <- yien_data[i, 2]
  details_url <- paste0("http://www.cbooo.cn/m/", movieID)

  # Throttle requests so the site is not hammered.
  Sys.sleep(0.2)

  # rvest is not attached by library(tidyverse), so qualify its functions.
  # NOTE(review): newer rvest releases rename html_session()/html_nodes() to
  # session()/html_elements() -- confirm against the installed version.
  session <- rvest::html_session(
    details_url,
    add_headers(`User-Agent` = details_ua)
  )

  movie_list <- session %>%
    rvest::html_nodes(details_selector) %>%
    rvest::html_text()
  # Strip line breaks and all spaces before splitting "label:value".
  movie_list <- gsub('\r\n', '', movie_list)
  movie_list <- gsub(' ', '', movie_list)
  # Character matrix: column 1 = text before the first colon, column 2 = rest.
  movie_fields <- str_split_fixed(movie_list, ":", 2)

  # seq_len() yields nothing for a page without matching nodes; 1:nrow()
  # would iterate over c(1, 0) and crash on a zero-length `if` condition.
  for (j in seq_len(nrow(movie_fields))) {
    label <- movie_fields[j, 1]
    if (j <= 2) {
      # The first two paragraphs are stored by position, using their
      # leading text rather than a label lookup.
      details_grid[i, j] <- label
    } else if (label %in% names(field_columns)) {
      details_grid[i, field_columns[[label]]] <- movie_fields[j, 2]
    } else {
      # Any unrecognised label lands in column 9 (the last one seen wins).
      details_grid[i, 9L] <- movie_fields[j, 2]
    }
  }
}

details_data <- as.data.frame(details_grid, stringsAsFactors = FALSE)

yien_newfile <- cbind(yien_data, details_data)

最终数据形式如下:
R语言爬虫 电影票房-艺恩网_第1张图片

你可能感兴趣的:(R语言)