R语言爬虫实践一

R语言爬虫-上海市二手房信息

由于最近一直在关注房产信息,心血来潮想看一下最近上海市二手房的信息,所以利用R语言爬了1万+套二手房信息供参考,下面是抓取网页信息的代码,还请多多指教。

参考代码如下:

library(RCurl)  
library(XML) 
library(raster)
library(stringr)
# Entry URL: first page of Shanghai second-hand housing listings.
start_url <- "http://sh.centanet.com/ershoufang/g1/"
# Request headers so the server treats the request like a regular browser.
cust_header <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive"
)
# Download the raw HTML source of the first page.
pagesource <- getURL(start_url, httpheader = cust_header, .encoding = "utf-8")
# Extract the total number of result pages from a listing page's HTML.
# The pager <span> holds text like "1/88"; the part after "/" is the total.
parseTotalPage <- function(pagesource) {
  doc <- htmlParse(pagesource)
  pager_xpath <- '//div[contains(@class,"result-lists")]//div[contains(@class,"select-bar clearfix")]//p[contains(@class,"pagerNum fr")]//span[@class="mr_10"]//text()'
  pager_text <- sapply(getNodeSet(doc, pager_xpath), xmlValue)
  # "current/total" -> take the piece after the slash as a number.
  as.numeric(str_split(pager_text, "/")[[1]][2])
}

# Parse one listing page and extract, per listing: complex (xiaoqu) name,
# room type, floor area, orientation/floor/decoration/year info, location
# (district, town, road) and prices (total and per square metre).
# Returns a data.frame with one row per listing; column names match the
# original output (room_info.room_region etc.) so downstream rbind/CSV
# headers are unchanged.
parseContent <- function(pagesource) {
  doc <- htmlParse(pagesource)
  # Complex (xiaoqu) name.
  district_name <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//a[@class="f000 mr_10"]//text()'), xmlValue)
  # Room type (e.g. "2室1厅").
  room_type <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//span[@class="f000 mr_10"]//text()'), xmlValue)
  # Floor area.
  room_area <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f14 f000 mb_10")]//span[@class="f000"]//text()'), xmlValue)
  # Orientation / floor / decoration / build-year: spread over several text
  # nodes, so join them, split on newlines, trim, and drop empty fragments.
  # trimws() (base R) replaces raster::trim — same leading/trailing
  # whitespace removal without needing the heavy raster package here.
  room_direction <- sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[contains(@class,"f7b mb_10")]//text()'), xmlValue)
  room_direction <- trimws(str_split(trimws(paste(room_direction, collapse = "")), "\n")[[1]])
  room_direction <- room_direction[which(nchar(room_direction) != 0)]
  # Location: each non-empty string looks like "district-town road".
  room_location <- trimws(sapply(getNodeSet(doc, '//div[contains(@class,"item-info fl")]//p[@class="f7b mb_15"]//text()'), xmlValue))
  room_location <- room_location[which(nchar(room_location) != 0)]
  # Split once, vectorized, instead of re-splitting inside a loop and
  # growing a data.frame with rbind (which is O(n^2)).
  loc_parts <- str_split(room_location, " ")
  region_town <- str_split(vapply(loc_parts, function(x) x[1], character(1)), "-")
  room_info <- data.frame(
    room_region = vapply(region_town, function(x) x[1], character(1)),
    room_town   = vapply(region_town, function(x) x[2], character(1)),
    room_road   = vapply(loc_parts,   function(x) x[2], character(1))
  )
  # Total price.
  room_price <- sapply(getNodeSet(doc, '//div[contains(@class,"item-pricearea fr")]//p[@class="price-nub cRed"]//text()'), xmlValue)
  # Price per square metre.
  room_avgprice <- sapply(getNodeSet(doc, '//div[contains(@class,"item-pricearea fr")]//p[@class="f14 f000 mb_15 fsm"]//text()'), xmlValue)
  # Assemble one row per listing; explicit names reproduce the column names
  # data.frame() originally deparsed from room_info$... expressions.
  result <- data.frame(district_name, room_type, room_area, room_direction,
                       room_info.room_region = room_info$room_region,
                       room_info.room_town   = room_info$room_town,
                       room_info.room_road   = room_info$room_road,
                       room_price, room_avgprice)
  result
}
# Determine the total page count and parse the already-downloaded first page.
total_page <- parseTotalPage(pagesource)
pageresults <- parseContent(pagesource)
# Build URLs for pages 2..total_page. seq_len() gives an empty vector when
# total_page == 1, whereas 1:(total_page - 1) would wrongly yield c(1, 0).
url_list <- paste0("http://sh.centanet.com/ershoufang/g",
                   seq_len(total_page - 1) + 1, "/")

# Download and parse every remaining page. Collect the per-page data frames
# in a preallocated list and bind once at the end — rbind inside the loop
# would copy the accumulated result on every iteration (O(n^2)).
page_list <- vector("list", length(url_list) + 1)
page_list[[1]] <- pageresults
for (i in seq_along(url_list)) {
  pagesource <- getURL(url_list[i], httpheader = cust_header, .encoding = "utf-8")
  page_list[[i + 1]] <- parseContent(pagesource)
}
pageresults <- do.call(rbind, page_list)
# The output file is named .csv, so write real comma-separated values;
# write.table's default space separator would not produce valid CSV.
write.csv(pageresults, "./pachong/pa_house.csv", row.names = TRUE)

你可能感兴趣的:(R语言)