# 本代码是基于《The influence of the neighbourhood environment on peer-to-peer accommodations: A random forest regression analysis》文章来实现的，具体可参考网站 https://doi.org/10.1016/j.jhtm.2022.02.028。
# 本文的附件及文章可在官网中下载。需要更多资源 qq:1148721210 微信:hongqiang_jiang
# 引用格式为: Jiang, H., Mei, L., Wei, Y., et al. (2022). The influence of the neighbourhood environment on peer-to-peer accommodations: A random forest regression analysis. Journal of Hospitality and Tourism Management, 51: 105-118.
# Multiple linear regression and Random forest regression
# Use the software RStudio 4.0.5
# -*- coding: UTF-8 -*-
# This code uses all room types ("All room") as the example. The same code applies to the other Airbnb listing types (entire home/apt, private room, shared room), so those versions are omitted to avoid redundancy.
## Loading packages and data------------------------------------------------------------
library(randomForest)
library(pheatmap)
library(extrafont)
library(corrplot)
library(car)
# Point R at the project folder; this path is machine-specific and must be
# adapted before running the script elsewhere.
setwd("C:/Users/Desktop/Airbnb")
# Read the comma-separated data file from the working directory
# (read.csv() already defaults to sep = ",").
Data_Airbnb <- read.csv(file = "Airbnb_data.csv")
## Multiple linear regression ---------------------------------------------------------------
# Fit Airbnb against the twelve neighbourhood predictors. The predictors must
# be written as formula terms joined by `+`; a character vector inside the
# formula is treated as a single (wrong-length) variable and makes lm() fail.
# NOTE(review): column names are taken from the names used in this script
# (e.g. `PopDen`) - confirm they match the header of Airbnb_data.csv.
Lm_Airbnb <- lm(
  Airbnb ~ PopDen + PGDP + HPrice + Distance + BusDen + MetroDen +
    CaterDen + ShopDen + RecrDen + UnivDen + HotelDen + AttrDen,
  data = Data_Airbnb
) # Modelling
summary(Lm_Airbnb) # View fitting results
lm.pred_Airbnb <- predict(Lm_Airbnb, Data_Airbnb) # Predicted results
# Comparison of predicted and actual results: the linear-model predictions
# are placed next to the original data.
lm.pred_Airbnb1 <- data.frame(lm.pred_Airbnb, Data_Airbnb)
# Multicollinearity test
# vif() has no `digits` argument, so the rounding is applied to its result.
round(vif(Lm_Airbnb), digits = 3) # Variance inflation factor (VIF)
## Random forest regression --------------------------------------------------------------
# Names of the twelve predictors, shared by the model formula and by the
# cross-validation call so that both use exactly the same columns.
# NOTE(review): names follow the spelling used elsewhere in this script
# (`PopDen`, not `popDen`) - confirm against the header of Airbnb_data.csv.
rf_predictors <- c(
  "PopDen", "PGDP", "HPrice", "Distance", "BusDen", "MetroDen",
  "CaterDen", "ShopDen", "RecrDen", "UnivDen", "HotelDen", "AttrDen"
)
# Build `Airbnb ~ PopDen + PGDP + ...`; a character vector placed directly
# inside a formula is not expanded into terms.
rf_formula <- reformulate(rf_predictors, response = "Airbnb")
set.seed(1234) # Setting up random number seeds
Rf_Airbnb <- randomForest(rf_formula, data = Data_Airbnb, ntree = 500, importance = TRUE) # Modelling
# Cross-validation
set.seed(1234)
# rfcv is a random forest cross-validation function. The predictor columns
# are selected by name rather than by position (2:12 covers only 11 columns
# and depends on the column order of the CSV).
result <- rfcv(Data_Airbnb[, rf_predictors], Data_Airbnb$Airbnb, cv.fold = 2, scale = "log", step = 0.5)
result$error.cv # View the cross-validation error for each number of variables
# Results of random forest regression
# These are predictions on the same data the forest was fitted to.
forest.pred_Airbnb <- predict(Rf_Airbnb, Data_Airbnb) # Predicted results
forest.pred_Airbnb1 <- data.frame(forest.pred_Airbnb, Data_Airbnb) # Comparison of predicted and actual results
# Cross-validation error chart
# Save the current graphical parameters so they can be restored afterwards.
opar <- par(no.readonly = TRUE)
par(lwd = 2, cex = 1, cex.axis = 1, font = 2, cex.lab = 1, tck = -.02)
# Plot the rfcv() output (number of variables vs. cross-validation error),
# which is what the axis titles below describe; the prediction vector is not
# a cross-validation curve. The x axis is logarithmic because rfcv() was run
# with scale = "log".
plot(result$n.var, result$error.cv, log = "x", type = "o", lwd = 2, font.lab = 2, font = 2, ann = FALSE, family = "Times")
title(xlab = "Number of features", ylab = "Cross-validation error", font.lab = 2)
par(opar)
# Variable importance - %IncMSE
# varImpPlot() needs the fitted forest, not its predictions.
varImpPlot(Rf_Airbnb, family = "Times")
dev.off() # Close the current graphics device
# The partial dependencies of variables
# Current graphical parameters are captured here; nothing in this section
# modifies or restores them.
opar <- par(no.readonly = TRUE)
# partialPlot() takes the fitted forest as its first argument, not the
# prediction vector. The `which.class` argument is only meaningful for
# classification forests, so it is not passed for this regression model.
partialPlot(Rf_Airbnb, Data_Airbnb, PopDen, main = " ", xlab = " ", ylab = " ", col = "black")
partialPlot(Rf_Airbnb, Data_Airbnb, PGDP, main = " ", xlab = " ", ylab = " ", col = "black") # The same applies to the other variables "Distance", "BusDen", etc.
# Comparison of multiple linear regression and random forest regression results ----
# R-value: correlation between each model's predictions and the observed
# Airbnb column of the data set.
with(Data_Airbnb, cor(lm.pred_Airbnb, Airbnb)) # Multiple linear regression R-value
with(Data_Airbnb, cor(forest.pred_Airbnb, Airbnb)) # Random forest regression R-value
# Mean absolute error (MAE)
#' Mean absolute error between two numeric vectors.
#'
#' @param actual Numeric vector of observed values.
#' @param predicted Numeric vector of predicted values.
#' @return The mean of the element-wise absolute differences.
MAE <- function(actual, predicted) {
  abs_errors <- abs(actual - predicted)
  mean(abs_errors)
}
# Arguments are named so that observed values go to `actual` and model
# output goes to `predicted`, matching the MAE(actual, predicted) signature.
# The absolute difference is symmetric, so the reported values are unchanged.
MAE(actual = Data_Airbnb$Airbnb, predicted = lm.pred_Airbnb) # Mean absolute error of multiple linear regression
MAE(actual = Data_Airbnb$Airbnb, predicted = forest.pred_Airbnb) # Mean absolute error of random forest regression
dev.off() # Close the current graphics device