R语言数据分析必备

一. 加载模块包

install.packages("tidyverse")
library(tidyverse)

其他方法

# initial settings
libary_path <- paste(getwd(), "packages",sep="/")
dir.create(libary_path,showWarnings = FALSE)
.libPaths(libary_path)


if(!require(testthat)){
    install.packages("testthat")
    library(testthat)
}
if(!require(repr)){
    install.packages("repr")
    library(repr)
}
if(!require(MASS)){
    install.packages("MASS")
    library(MASS)
}
if(!require(gclus)){
    install.packages("gclus")
    library(gclus)
}
if(!require(GGally)){
    install.packages("GGally")
    library(GGally)
}
if(!require(ggcorrplot)){
    install.packages("ggcorrplot")
    library(ggcorrplot)
}
if(!require(tidyverse)){
    install.packages("tidyverse")
    library(tidyverse)
}


# Plot size deppening on your screen resolution to 4 x 3
options(repr.plot.width=4, repr.plot.height=3)

二. 读取数据:读取写入excel.xlsx

方法1:将数据和文件放置在同一文件夹下直接读取
rt <- read.table("3.txt",head=TRUE)
方法2:设置路径后读取
#读取txt数据
setwd("D:\\data")
rt <- read.table("3.txt",head=TRUE)
方法3:弹窗手动选择数据
data <- read.delim(file.choose())
data <- read.csv(file.choose())
方法4:直接路径读取数据
data <- read.delim("D:/data/xxx.csv")
方法5:复制表格内容
#右键复制表格内容后输入如下代码 以便导入复制在内存中数据:
data=read.table("clipboard",sep="\t",header = T)
制作数据
c1 <- 1:5  
c2 <- c(2,5,1,8,6)
c3 <- c("w","q","r","t","p")
c4 <- c("i","w","u","z","v")
cite <- data.frame(c1,c2,c3,c4) ##生成数据框
查看数据
View(data)
str(data)
summary(data)
查看行和列的个数是(多少行 多少列)
dim(data)
#结果是:1,2 这样1行2列

查看某列元素(不重复)

unique(data$列名)
unique(data[,1])
table(data[,2])

查看 列名 & 行名

names(data)
row.names(data)

设置 列名 & 行名

names(data) = c("A","B","C")
row.names(data)=c("A","B","C")

增加一列 总计

data$total=rowSums(data)

增加一行 总计

data[7,]=colSums(data)
#这里7表示行数+1,所以如下也可以
#data[nrow(data)+1,]=colSums(data)
#或者使用合并方法
#rbind(df1,colSums(data))

三. 数据处理

改变数值类型

for (i in 2:dim(df)[2]){
df[,i]=as.numeric(df[,i])}

取消科学计数,保留小数

options(scipen = 200)

去重查看

unique(data)

标准化("max-min"结果肯定大于0) 和 归一化(结果不一定大于0)

#################  自定义标准化 #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
data[,-11]=normalize(data[,-11])
head(data)
##或者构造新的数据集
#data_Stand=as.data.frame(lapply(data[,-11],normalize))
#data_Stand$class=data$class
#head(data_Stand)

################# 归一化函数包scale() #####################################
#第11列是标签列(目标列)
data[,-11]=scale(data[,-11])
#scale(x, center = TRUE, scale = TRUE)
View(data)

处理查看空缺值

################## Processing  "NAN"  ################## 
na_flag = apply(is.na(data), 2, sum)
View(na_flag)
#删掉所在行
dataset<- na.omit(data)
na_flag = apply(is.na(dataset), 2, sum)
View(na_flag)

删除空值所在列

##################删除空值大于30的列##################
flag <- apply(data, 2, function(x) sum(is.na(x)) <= 30)
newdata <- data[, which(flag)]
na_flag = apply(is.na(newdata ), 2, sum)
View(na_flag)

盖帽法替换异常值

############################### 数据处理 #####################
#绘制频率分布直方图
par(mfrow=c(2,5))
for(i in 1:10)
{
hist(data[,i],main=paste(names(data)[i],"的频率分布直方图"),xlab=NA) 
}  


#绘制箱线图检验离群值
par(mfrow=c(2,5))
for(i in 1:10)
{
boxplot(data[i],main=paste(names(data)[i],"的箱线图"))   
}   

#或者另一种方法绘制箱线图检验离群值
par(mfrow=c(1,1))
boxplot(data[,-11])

###### 绘制散点图 #######
##par(mfrow=c(4,4))
##for(i in 2:17)
##{
##plot(data[,i],main=paste(names(data)[i],"的散点图"),xlab=NA,ylab=NA)
##} 
#######################

#采用盖帽法替换异常值,用10%处的数据覆盖分布在10%以下的数据,用90%处的数据覆盖分布在99%以上的数据。
#这里的10%和90%取值有些极端,及供参考。
block<-function(x,lower=T,upper=T){
  if(lower){
    q1<-quantile(x,0.1)
    x[x<=q1]<-q1
  }
  if(upper){
    q99<-quantile(x,0.90)
    x[x>q99]<-q99
  }
  return(x)
}

data[,-11] = sapply(data[,-11],block)
par(mfrow=c(1,1))
boxplot(data[,-11],frame = T)
str(data)
head(data)

相关图

##### 绘制替换异常值后的相关图  ##########
#相关系数cor
b=cor(data[,-11])
b
#安装ggpubr包可视化
#install.packages("ggpubr")
#install.packages("corrplot")
library("corrplot")
par(mfrow=c(1,1))
corrplot(b)
#或通过饼图查看相关性
corrplot(b,method="pie")
#用颜色显示,同时显示相关系数。
corrplot(b,method="color",addCoef.col="grey")
#显示数字,增强图形可读性。
col=colorRampPalette(c("navy", "white", "firebrick3")) #设置颜色
corrplot(b,add=TRUE, type="lower", method="number",diag=FALSE,tl.pos="n", cl.pos="n",col=col(10))


#绘制变量间相关性多图混合
par(mfrow=c(1,1))
library(PerformanceAnalytics)
chart.Correlation(data[,-11])

关于日期

#选定日期
start=as.Date("2020-1-22",'%Y-%m-%d')#起始日期
end = as.Date("2021-1-4",'%Y-%m-%d') #终止日期
ndays = end - start + 1              #总天数
#构造每日日期向量dates
dates = seq(from=start, by=1, length.out=ndays)
View(dates)

拼接变量和字符串

paste("xxx",i)

四. 划分训练集和测试集

方法1

data=data("iris")
set.seed(2021)
train_sample = sample(10000,8000) #10000个里面随机抽取8000个
#train_sample = sample(nrow(iris),0.8*nrow(iris)) #10000个里面随机抽取8000个
train=data[train_sample, ]
test=data[-train_sample, ]

方法2

data("iris")
set.seed(2)
ind = sample(2,nrow(iris),replace = TRUE,prob = c(0.7,0.3))
trainset = iris[ind == 1,]
testset = iris[ind == 2,]

查看测试集和样本集

#查看测试集和样本集
str(classdata_train)
str(classdata_test)
#接着我们查看所抽取训练集和测试集标签中各字母所占的比例
dataclassper=prop.table(table(data$class))
dataclassper
train=prop.table(table(classdata_train$class))
train
test=prop.table(table(classdata_test$class))
test

五. 条件,循环,函数

1条件

1.1条件选行 给 一个新的向量

a=df[df$列名 == 条件,]

例子

a=iris[iris$Species=="virginica",]
head(a)

Sepal.Length Sepal.Width Petal.Length Petal.Width Species
6.3 3.3 6.0 2.5 virginica
5.8 2.7 5.1 1.9 virginica
7.1 3.0 5.9 2.1 virginica

按行条件筛选

library(tidyfst)
library(survival)  # 加载数据集所需的包
data("colon") # 加载数据集

# 单条件筛选
filter_dt(colon, sex == 1)  # 筛选colon数据集中男性患者

# 多条件筛选
# 筛选≥50岁的男性患者
filter_dt(colon,sex==1 & age>=50)

# 筛选男性或年龄≥50岁的患者
filter_dt(colon, sex == 1 | age >= 50)

# 筛选age中大于age平均值的行
colon %>% filter_dt(age > mean(age, na.rm = TRUE)) 

# 筛选男性中age大于age平均值的行
colon %>% filter_dt(sex == 1 & age > mean(age, na.rm = TRUE))

# 筛选 50 ≤ age < 60的患者
filter_dt(colon, age >= 50 & age < 60)
# 筛选 50 ≤ age ≤ 60的患者
colon %>% filter_dt(between(age,50,60))

# 筛选肿瘤分化程度differ为1和2,且性别为男性的行
colon %>% filter_dt(differ %in% c(1,2), sex == 1)


# 筛选不同性别中年龄大于各自性别的年龄平均值的行  
colon %>% 
  group_dt(
    by = sex,
    filter_dt(age > mean(age, na.rm = TRUE))
  )

iris %>% filter_dt(Sepal.Length == max(Sepal.Length))

2循环

2.1循环添加元素到空向量

Vector=c()
for (i in 1:5){
Vector=c(Vector,i)
}
Vector

2.2循环添加向量到空数据框

D=data.frame()
for (i in 1:5){
D=rbind(D,f(i))
}
Vector

六 各类算法模型

1分类算法

data = read.csv(file.choose())
View(data)

#histplot
par(mfrow=c(2,5))
for(i in 1:10)
{
hist(data[,i],main=paste(names(data)[i]),xlab=NA,col="pink") 
}  

#################  standarzation #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
data[,-11]=normalize(data[,-11])
head(data)



#boxplot
par(mfrow=c(2,5))
for(i in 1:10)
{
boxplot(data[i],main=paste("Boxplot of",names(data)[i]))   
}   



#Map method
block<-function(x,lower=T,upper=T){
  if(lower){q1<-quantile(x,0.1);x[x<=q1]<-q1}
  if(upper){q99<-quantile(x,0.90);x[x>q99]<-q99}
  return(x)
}

data[,-11] = sapply(data[,-11],block)
par(mfrow=c(1,1))
boxplot(data[,-11],frame = T,col="purple")
str(data)
head(data)


Model_comparison = function(data){
#load required packages
if(!require(nnet)){
    install.packages("nnet")
    library(nnet)
}
if(!require(gmodels)){
    install.packages("gmodels")
    library(gmodels)
}
if(!require(MASS)){
    install.packages("MASS")
    library(MASS)
}
if(!require(C50)){
    install.packages("C50")
    library(C50)
}
if(!require(kernlab)){
    install.packages("kernlab")
    library(nnet,gmodels,MASS,C50,kernlab,randomForest)
}
if(!require(randomForest)){
    install.packages("randomForest")
    library(randomForest)
}
#Partition data set
set.seed(2021)
train_sample = sample(nrow(data),0.8*nrow(data))
train=data[train_sample, ]
test =data[-train_sample, ]

#set the vectors of time and Accuracy
t=c()
Accuracy=c()

######################  logistic regression   ######################
ptm <- proc.time()
model = glm(as.factor(train$class)~., data=train, family=binomial())
a=proc.time()-ptm
t[1]=a[[1]]
prob = predict(model, test, type="response")#type="response"Get the prediction probability
pred = factor(prob > 0.5, levels=c(FALSE, TRUE), labels=c("g", "h"))#If the probability is greater than 0.5, it is true and labeled as malignant
agreement = pred == test$class
Accuracy[1]=prop.table(table(agreement))[[2]]

################  Linear discriminant analysis   ##################
ptm <- proc.time()
model =lda(as.factor(train$class)~.,data=train)
a=proc.time()-ptm
t[2]=a[[1]]
predictions=predict(model , newdata = test)
newGrop = predictions$class
agreement = newGrop == test$class
Accuracy[2]=prop.table(table(agreement))[[2]]

###########   Quadratic discriminant analysis   ####################
ptm <- proc.time()
model =qda(as.factor(train$class)~.,data=train)
a=proc.time()-ptm
t[3]=a[[1]]
predictions=predict(model , newdata = test)
newGrop = predictions$class
agreement = newGrop == test$class
Accuracy[3]=prop.table(table(agreement))[[2]]

###########   Bayes discriminant analysis   ########################
ptm <- proc.time()
model =lda(as.factor(train$class)~.,data=train,prior = c(1,1)/2)
a=proc.time()-ptm
t[4]=a[[1]]
predictions=predict(model , newdata = test)
newGrop = predictions$class
agreement = newGrop == test$class
Accuracy[4]=prop.table(table(agreement))[[2]]

############################## SVM   ################################
ptm <- proc.time()
class_classifier = ksvm(as.factor(class)~., data = train, kernel = "vanilladot")  
a=proc.time()-ptm
t[5]=a[[1]]
predictions = predict(class_classifier, test)
agreement <- predictions == test$class
Accuracy[5]=prop.table(table(agreement))[[2]]

########################## decision trees ##############################
#使用C5.0函数获得决策树模型
ptm <- proc.time()
data_model = C5.0(train[,-11], as.factor(train$class))
a=proc.time()-ptm
t[6]=a[[1]]
predictions = predict(data_model, test)
table(predictions, test$class)
agreement <- predictions == test$class
Accuracy[6]=prop.table(table(agreement))[[2]]

############################## Random Forest ################################
ptm <- proc.time()
data.rf = randomForest(as.factor(class) ~ ., data=train, importance=TRUE, proximity=TRUE)
a=proc.time()-ptm
t[7]=a[[1]]
predictions = predict(data.rf, test)
agreement <- predictions == test$class
Accuracy[7]=prop.table(table(agreement))[[2]]

############################ neural network ############################ 
model=nnet(as.factor(train$class)~.,size=20,data=train)
a=proc.time()-ptm
t[8]=a[[1]]
predictions =predict(model,test,type='class')
agreement = predictions == test$class
Accuracy[8]=model_nnet=prop.table(table(agreement))[[2]]

#Summary of statistical results
df=data.frame(rbind(Accuracy,t))
names(df) = c("logistic regression","Linear discriminant analysis","Quadratic discriminant analysis","Bayes discriminant analysis","SVM","decision trees","Random Forest","neural network")
df=t(df)
names(df) = c("Accuracy","Time")
#Export results
write.csv(df,"D:/result.csv")
return(df)
}
result=Model_comparison(data)
View(result)

回归

#install.packages("randomForest")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("pROC")
#install.packages("nnet")
#install.packages("DMwR")
#install.packages("kernlab")
library(randomForest)
library(dplyr)
library(ggplot2)
library(nnet)
library(DMwR)
library(e1071)

################################  load data  ###########################
data=read.csv("D:/data/cardekho.csv")
########################图###################
##图一
library(ggplot2)
str(data)
ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
  geom_violin()+
  geom_boxplot(width=0.3)+
  labs(x = "",y="selling_price",Title="")+
  facet_wrap(~brand,scale="free")+
  theme_classic()
##图二
ggplot(data,aes(x=brand,y=selling_price,fill=brand))+
  geom_bar(stat = 'identity')+
  labs(x = "brand",y="selling_price",Title="")+
  theme_classic() +
  coord_flip()
##图三
library(ggridges)
ggplot(data) +
  geom_density(aes(x = selling_price,fill=brand))+ 
  facet_wrap(~brand,scale="free")+
  theme_classic()
##图4
ggplot(data,aes(y=as.factor(brand) ,x=max_cost_price,fill=brand))+
  geom_density_ridges()+
  theme_classic()


################## Processing  "NA"  ################## 
na_flag = apply(is.na(data), 2, sum)
View(na_flag)
#删掉所在行
dataset<- na.omit(data)
na_flag = apply(is.na(dataset), 2, sum)
View(na_flag)
str(dataset)

####################### 数据整理 ################
df=dataset[-5]
df$car_name=as.numeric(as.factor(df$car_name))
df$brand=as.numeric(as.factor(df$brand))
df$model=as.numeric(as.factor(df$model))
df$seller_type=as.numeric(as.factor(df$seller_type))
df$fuel_type=as.numeric(as.factor(df$fuel_type))
df$transmission_type=as.numeric(as.factor(df$transmission_type))
str(df)
View(df)

########相关###########
#install.packages("corrplot")
library("corrplot")
b=cor(df)
b
corrplot(b,method="color",addCoef.col="grey")
write.csv(b,"D:/data/cor.csv")

plot(df$max_power,df$selling_price,col='3')
plot(df$max_cost_price,df$selling_price,col='4')
plot(df$engine,df$selling_price,col='5')


## 挑选属性
testdata=df
ddf=testdata[,-1]
df=ddf[,-2]
str(df)

#################  自定义标准化 #####################################
normalize = function(x) {return ((x - min(x)) / (max(x) - min(x)))}
df[,-14]=normalize(df[,-14])
str(df)



#200位数以内不使用科学计数法
options(scipen = 200)
###################### 开始模型准备 ########################
#Partition data set
set.seed(2021)
train_sample = sample(nrow(df),0.99*nrow(df))
train=df[train_sample, ]
test =df[-train_sample, ]
### 未处理数据作为对比的对照模型
#set.seed(2021)
#train_sample = sample(nrow(testdata),0.99*nrow(df))
#train=testdata[train_sample, ]
#test =testdata[-train_sample, ]

#####################################################################
#####################################################################
########################       线性回归      ########################
#####################################################################
#####################################################################
model = lm(train$selling_price~., data=train)
pred=predict(model,newdata = test)

regr.eval(test$selling_price,pred)
print("R-squared");cor(test$selling_price,pred)^2
result=data.frame(实际=test$selling_price,线性回归=pred)
plot(test$selling_price,pred,main="linear regression") 

#####################################################################
#####################################################################
#############################  randomForest  ########################
#####################################################################
#####################################################################
set.seed(2021)
model2 = randomForest(train$selling_price~., data=train, importance=TRUE)
pred2 = predict(model2, test)

regr.eval(test$selling_price,pred2)
print("R-squared");cor(test$selling_price,pred2)^2
result$随机森林回归=pred2
plot(test$selling_price,pred2,main="Random forest regression") 

#####################################################################
#####################################################################
########################       svm回归      ########################
#####################################################################
#####################################################################
library(e1071)
model3 = svm(train$selling_price~., data=train)
pred3=predict(model3,newdata = test)

regr.eval(test$selling_price,pred3)
print("R-squared");cor(test$selling_price,pred3)^2
result$svm=pred3
plot(test$selling_price,pred3,main="svm regression") 

#####################################################################
#####################################################################
########################       决策树回归      ########################
#####################################################################
#####################################################################
library(rpart)
library(rpart.plot)
model4=rpart(train$selling_price~., data=train)
pred4=predict(model4,newdata = test)

regr.eval(test$selling_price,pred4)
print("R-squared");cor(test$selling_price,pred4)^2
result$决策树=pred4
plot(test$selling_price,pred4,main="Decision tree regression") 

library(rattle)
fancyRpartPlot(model4)
#####################################################################
#####################################################################
########################       xgboost     ########################
#####################################################################
#####################################################################
#install.packages("xgboost")
library("xgboost")
library(Matrix)

traindata1 <- Matrix(data.matrix(train[,-16]),sparse=T) # 利用Matrix函数,将sparse参数设置为TRUE,转化为稀疏矩阵
traindata2 <- data.matrix(train[,16]) # 将因变量转化为numeric
traindata3 <- list(data=traindata1,label=traindata2) # 将自变量和因变量拼接为list

dtrain <- xgb.DMatrix(data = traindata3$data, label = traindata3$label) # 构造模型需要的xgb.DMatrix对象,处理对象为稀疏矩阵
dtest = xgb.DMatrix(data.matrix(test[,-16])) # 构造模型需要的xgb.DMatrix对象,处理对象为稀疏矩阵

param <- list(max_depth=20,eta=0.03) # 定义模型参数
model <- xgboost(params=param,data = dtrain,nrounds = 15)
pred5 <- predict(model,dtest)
 
regr.eval(test$selling_price,pred5)
print("R-squared");cor(test$selling_price,pred5)^2

result$xgboost=pred5 
plot(test$selling_price,pred5,main="xgboost regression") 
View(result)
#去D盘data文件夹,result表格中复制需要的内容哈~
write.csv(result,"D:/data/result.csv")

#############################################################
#############################################################
######################## 误差曲线图 ############################
#############################################################

result$随机森林误差 = abs(result$实际-result$随机森林回归)
result$线性回归误差 = abs(result$实际-result$线性回归)
result$svm误差 = abs(result$实际-result$svm)
result$决策树误差 = abs(result$实际-result$决策树)
result$xgboost误差 = abs(result$实际-result$xgboost)

result$随机森林相对误差 = abs(result$实际-result$随机森林回归)/result$实际
result$线性回归相对误差 = abs(result$实际-result$线性回归)/result$实际
result$svm相对误差 = abs(result$实际-result$svm)/result$实际
result$决策树相对误差 = abs(result$实际-result$决策树)/result$实际
result$xgboost相对误差 = abs(result$实际-result$xgboost)/result$实际
View(result)

#误差
x = 1:length(result$线性回归误差)
plot(x, result$随机森林误差, type = "n", xlab = "", ylab = "", ylim = range(result$xgboost误差, result$随机森林误差) * c(10,1))
points(x, result$线性回归误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
points(x, result$随机森林误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
points(x, result$xgboost误差, type = "o", pch = 3, col = 5, lty = 5, lwd = 2)
title(main = "误差图", xlab = "测试集序号", ylab = "误差")
legend("topleft", inset=.05, c("线性回归误差","随机森林误差","svm","决策树误差","xgboost误差"),lty=c(1, 2,3,4,5), pch=c(1, 2,3,4,5),col=c(1, 2,3,4,5))

#相对误差
x = 1:length(result$线性回归相对误差)
plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$线性回归相对误差, result$随机森林相对误差) * c(11,1))
points(x, result$线性回归相对误差, type = "o", pch = 1, col = 1, lty = 1, lwd = 2)
points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树相对误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
points(x, result$xgboost相对误差, type = "o", pch = 3, col = 5, lty = 5, lwd = 2)
title(main = "相对误差图", xlab = "测试集序号", ylab = "误差")
legend("topleft", inset=.05, c("线性回归相对误差","随机森林相对误差","svm","决策树相对误差","xgboost相对误差"),lty=c(1, 2,3,4,5), pch=c(1, 2,3,4,5),col=c(1, 2,3,4,5))

#相对误差部分观察
x = 1:length(result$线性回归相对误差)
plot(x, result$随机森林相对误差, type = "n", xlab = "", ylab = "", ylim = range(result$决策树相对误差, result$随机森林相对误差) * c(10,1))
points(x, result$随机森林相对误差, type = "o", pch = 2, col = 2, lty = 2, lwd = 2)
points(x, result$svm相对误差, type = "o", pch = 3, col = 3, lty = 3, lwd = 2)
points(x, result$决策树相对误差, type = "o", pch = 3, col = 4, lty = 4, lwd = 2)
title(main = "相对误差图", xlab = "测试集序号", ylab = "误差")
legend("topleft", inset=.05, c("随机森林相对误差","svm","决策树相对误差"),lty=c(2,3,4), pch=c(2,3,4),col=c(2,3,4))


##########随机森林参数调整############
A=c()
#注意循环次数越大,耗时越久(这里to表示最多尝试树的数目,by为间隔)
ns=seq(from=50, to=350, by=50)
for (i in ns){
set.seed(2021)
pred=randomForest(train$selling_price~., data=train, importance=TRUE, ntree=i) %>% predict(test)
p_acc=regr.eval(test$selling_price,pred)
A=c(A,p_acc[[4]])
}
A


plot(ns,A,main="随机森林所包含树的数目与MAPE关系图", ylab="MAPE", xlab="随机森林所包含的树的数目", frame.plot = TRUE,axes=FALSE)
lines(ns,A)
axis(1, at = ns, labels = formatC(ns, format = "fg"))
axis(2, at = seq(from=0.25, to=0.30, by=0.005), labels = paste(round(seq(from=25, to=30, by=0.5),2),"%"))
text(ns,A,round(A,4),pos=1,cex=0.7,offset=-.8)




#####################################################################
#####################################################################
#############################  最优 randomForest  ####################
#####################################################################
#####################################################################
set.seed(2021)
model2 = randomForest(train$selling_price~., data=train, importance=TRUE,ntree=300)
pred2 = predict(model2, test)

regr.eval(test$selling_price,pred2)
print("R-squared");cor(test$selling_price,pred2)^2
plot(test$selling_price,pred2,main="Random forest regression") 

str(df)
fill=brand

ggplot(df) +
  geom_density_ridges(aes(y = brand, x = selling_price))





你可能感兴趣的:(R语言数据分析必备)