由于最近一直在关注房产信息,心血来潮想看看上海市二手房的行情,于是用 R 语言爬取了 1 万多套二手房信息供参考。下面是抓取网页信息的代码,还请多多指教。
参考代码如下:
library(RCurl)
library(XML)
library(raster)
library(stringr)
# Entry point: first page of the Shanghai second-hand housing listings.
start_url <- "http://sh.centanet.com/ershoufang/g1/"

# Custom request headers (browser-like User-Agent, accepted content types,
# language and connection handling) passed to getURL().
cust_header <- c(
  "User-Agent" = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:26.0) Gecko/20100101 Firefox/26.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-us",
  "Connection" = "keep-alive"
)

# Download the page source (raw HTML) of the first page.
pagesource <- getURL(
  start_url,
  httpheader = cust_header,
  .encoding = "utf-8"
)
# Parse the total number of result pages from a listing page.
#
# The text of the span with class "mr_10" inside the "pagerNum fr" paragraph
# is split on "/" and the second piece is converted to a number.
# NOTE(review): this implies the pager text looks like "current/total";
# confirm against the live page.
#
# Args:
#   pagesource: HTML source of a listing page, as a character string.
# Returns:
#   The page count as a numeric scalar; NA when the pager text has no
#   second "/"-separated piece or that piece is not a number.
# Raises:
#   An error when the pager element is not present in the page.
parseTotalPage <- function(pagesource) {
  doc <- htmlParse(pagesource)
  pager_xpath <- paste0(
    '//div[contains(@class,"result-lists")]',
    '//div[contains(@class,"select-bar clearfix")]',
    '//p[contains(@class,"pagerNum fr")]',
    '//span[@class="mr_10"]//text()'
  )
  pager_nodes <- getNodeSet(doc, pager_xpath)

  # Without this guard a missing pager surfaces as a cryptic
  # "subscript out of bounds" from the [[1]] below.
  if (length(pager_nodes) == 0) {
    stop(
      "parseTotalPage: pager element not found in page source",
      call. = FALSE
    )
  }

  # Only the first matching text node is used, as before.
  pager_text <- xmlValue(pager_nodes[[1]])
  pieces <- str_split(pager_text, "/")[[1]]
  as.numeric(pieces[2])
}
# Parse one listing page into a data frame with one row per listing.
#
# Extracts the community name, room layout, floor area, a combined
# orientation/floor/decoration/year string, the location (region, town,
# road), the total price and the unit price.
#
# Args:
#   pagesource: HTML source of a listing page, as a character string.
# Returns:
#   A data frame with the columns district_name, room_type, room_area,
#   room_direction, room_info.room_region, room_info.room_town,
#   room_info.room_road, room_price and room_avgprice. A page without any
#   listing yields a zero-row data frame with the same columns.
# Raises:
#   data.frame() stops when the extracted fields have different lengths,
#   i.e. when some listing lacks one of the fields.
parseContent <- function(pagesource) {
  doc <- htmlParse(pagesource)

  # Text of every node matched by `xpath`, always as a character vector:
  # vapply() gives character(0) for no match, where sapply() gives list().
  node_text <- function(xpath) {
    vapply(getNodeSet(doc, xpath), xmlValue, character(1))
  }
  # Drop empty strings (whitespace-only text nodes after trimming).
  drop_empty <- function(x) x[which(nchar(x) != 0)]
  # n-th piece of every split result, NA where a piece is missing.
  nth_piece <- function(pieces, n) {
    vapply(pieces, function(p) p[n], character(1))
  }

  info_xpath <- '//div[contains(@class,"item-info fl")]'
  title_xpath <- paste0(info_xpath, '//p[contains(@class,"f14 f000 mb_10")]')
  price_xpath <- '//div[contains(@class,"item-pricearea fr")]'

  # Community name, room layout and floor area share the title paragraph.
  district_name <- node_text(
    paste0(title_xpath, '//a[@class="f000 mr_10"]//text()')
  )
  room_type <- node_text(
    paste0(title_xpath, '//span[@class="f000 mr_10"]//text()')
  )
  room_area <- node_text(paste0(title_xpath, '//span[@class="f000"]//text()'))

  # Orientation, floor, decoration and build year: all text nodes are glued
  # together and re-split on newlines, so each non-empty line becomes one
  # entry. `trim` is presumably raster::trim (raster is loaded by this file).
  direction_text <- node_text(
    paste0(info_xpath, '//p[contains(@class,"f7b mb_10")]//text()')
  )
  direction_lines <- str_split(
    trim(paste(direction_text, collapse = "")),
    "\n"
  )[[1]]
  room_direction <- drop_empty(trim(direction_lines))

  # Location text is split on spaces: the first piece is "region-town",
  # the second piece is the road.
  room_location <- drop_empty(trim(node_text(
    paste0(info_xpath, '//p[@class="f7b mb_15"]//text()')
  )))
  # Split every location once, vectorised; this replaces a 1:length() loop
  # that broke on empty input and grew a data frame with rbind().
  location_pieces <- str_split(room_location, " ")
  region_town <- str_split(nth_piece(location_pieces, 1), "-")
  room_region <- nth_piece(region_town, 1)
  room_town <- nth_piece(region_town, 2)
  room_road <- nth_piece(location_pieces, 2)

  # Total price and unit price.
  room_price <- node_text(
    paste0(price_xpath, '//p[@class="price-nub cRed"]//text()')
  )
  room_avgprice <- node_text(
    paste0(price_xpath, '//p[@class="f14 f000 mb_15 fsm"]//text()')
  )

  # The room_info.* names match what the previous implementation produced
  # from `room_info$...` arguments, so downstream code keeps working.
  data.frame(
    district_name,
    room_type,
    room_area,
    room_direction,
    room_info.room_region = room_region,
    room_info.room_town = room_town,
    room_info.room_road = room_road,
    room_price,
    room_avgprice
  )
}
# Total number of pages, and the parsed contents of the first page.
total_page <- parseTotalPage(pagesource)
pageresults <- parseContent(pagesource)

# Fail early with a clear message instead of building bogus page URLs.
if (is.na(total_page) || total_page < 1) {
  stop("could not determine the total number of result pages", call. = FALSE)
}

# URLs of pages 2..n. seq_len() gives an empty sequence for a single-page
# result, where 1:(total_page - 1) would count down through c(1, 0) and
# request a page 2 that does not exist. sprintf() (unlike paste0()) returns
# character(0) for an empty page vector.
page_numbers <- seq_len(total_page - 1) + 1L
url_list <- sprintf("http://sh.centanet.com/ershoufang/g%d/", page_numbers)

# Download and parse every remaining page. Results are collected in a
# preallocated list and combined once, instead of growing the data frame
# with rbind() on each iteration.
page_tables <- vector("list", length(url_list))
for (i in seq_along(url_list)) {
  pagesource <- getURL(
    url_list[i],
    httpheader = cust_header,
    .encoding = "utf-8"
  )
  page_tables[[i]] <- parseContent(pagesource)
}
pageresults <- do.call(rbind, c(list(pageresults), page_tables))

# Save the combined result. The output directory is created first so that a
# missing folder cannot discard the whole crawl at the very last step.
out_file <- "./pachong/pa_house.csv"
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
# NOTE(review): write.table() defaults to a space separator, so despite the
# .csv extension this file is space-delimited. Kept as-is so existing
# readers keep working; switch to write.csv() if real CSV is wanted.
write.table(pageresults, out_file, row.names = TRUE)