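# Data source (as cited): https://www.kaggle.com/c/house-prices-advanced-regression-techniques
# The dataset is described as the "House Prices" data from Kaggle.
# NOTE(review): the columns used below (total_bedrooms, households,
# ocean_proximity, median_house_value) look like the California Housing Prices
# dataset rather than that competition's data - confirm the source link.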
# Load the required packages
library(tidyverse)
library(reshape2)
# Read the data. The path is relative, so housing.csv must be in the working
# directory.
housing <- read.csv("housing.csv")

# Take a quick look at the data: first rows and per-column summaries.
head(housing)
summary(housing)

# List the column names.
colnames(housing)
# Plot the distribution of each variable as a grid of histograms.
# melt() reshapes the data to long form (columns `variable` and `value`) so
# that facet_wrap() can draw one panel per original column. When no id or
# measure variables are given, melt() uses factor/character columns as id
# variables, so those columns do not get a panel of their own.
ggplot(data = melt(housing), mapping = aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~variable, scales = "free_x")
# Median imputation: fill missing total_bedrooms values with the median of the
# observed (non-missing) values.
housing$total_bedrooms[is.na(housing$total_bedrooms)] <-
  median(housing$total_bedrooms, na.rm = TRUE)
# Replace total_bedrooms and total_rooms with their per-household ratios
# (mean_bedrooms and mean_rooms).
housing$mean_bedrooms <- housing$total_bedrooms / housing$households
housing$mean_rooms <- housing$total_rooms / housing$households

# Drop the raw totals now that the ratios have been derived from them.
drops <- c("total_bedrooms", "total_rooms")
housing <- housing[, !(names(housing) %in% drops)]
head(housing)
# Encode ocean_proximity as dummy variables and standardise the remaining
# predictors.
library(fastDummies)

# Columns are selected by name rather than by position, so the result does not
# depend on the column order of housing.csv or on how many ocean_proximity
# levels are present.
non_predictors <- c("median_house_value", "ocean_proximity")

# Centre and scale every column except the target and the categorical column.
# scale() returns a numeric matrix.
housing1 <- housing[setdiff(names(housing), non_predictors)]
housing1 <- scale(housing1)

# One indicator column per ocean_proximity level (no level is dropped). The
# new columns are named "ocean_proximity_<level>".
housing2 <- dummy_cols(
  housing,
  select_columns = "ocean_proximity",
  remove_first_dummy = FALSE
)
is_dummy <- startsWith(names(housing2), "ocean_proximity_")

# Assemble the modelling table: dummies, scaled predictors, unscaled target.
# The target keeps the column name `housing.median_house_value`, which the
# later modelling steps refer to.
housing3 <- data.frame(
  housing2[is_dummy],
  housing1,
  housing.median_house_value = housing$median_house_value
)
library(randomForest)
library(caret)

# Split the data: 80% training set, 20% test set.
set.seed(1234)
trainIndex <- createDataPartition(
  housing3$housing.median_house_value,
  p = 0.8,
  list = FALSE,
  times = 1
)
# createDataPartition() samples within groups of the outcome, so the training
# set's outcome distribution resembles that of the full data. For a numeric
# outcome the groups are based on percentiles (see the caret documentation).

# Training set: the sampled rows.
dataTrain <- housing3[trainIndex, ]
# Test set: all remaining rows.
dataTest <- housing3[-trainIndex, ]
# Random forest
library(randomForest)
names(dataTrain)
set.seed(1)

# Separate the response from the predictors.
train_y <- dataTrain[, "housing.median_house_value"]
train_x <- dataTrain[, names(dataTrain) != "housing.median_house_value"]
head(train_y)
head(train_x)

# Fit a 500-tree forest; importance = TRUE asks randomForest to also assess
# the importance of each predictor.
rf_model <- randomForest(train_x, y = train_y, ntree = 500, importance = TRUE)
names(rf_model)
# Variable importance as stored on the fitted model.
rf_model$importance
# Out-of-bag (OOB) error. predict() without new data returns the out-of-bag
# predictions for the training rows, so the RMSE below is an OOB estimate even
# though the intermediate value is named train_mse.
oob_prediction <- predict(rf_model)
train_mse <- mean(as.numeric((oob_prediction - train_y)^2))
oob_rmse <- sqrt(train_mse)
oob_rmse
# Evaluate the model on the held-out test set.
test_y <- dataTest[, "housing.median_house_value"]
test_x <- dataTest[, names(dataTest) != "housing.median_house_value"]

# Predict the test rows and report the root-mean-squared error.
y_pred <- predict(rf_model, test_x)
test_mse <- mean((y_pred - test_y)^2)
test_rmse <- sqrt(test_mse)
test_rmse