好看
https://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf
https://rpubs.com/ppaquay/65561
https://www.cnblogs.com/xuancaoyy/p/5309966.html
https://blog.csdn.net/rojyang/article/details/85321244
https://blog.csdn.net/weixin_36372879/article/details/80493968
https://blog.csdn.net/yawei_liu1688/article/details/78891050
# MD
多变量离群值的查找方法,也就是异常值。用卡方检验进行考察(0.005作显著性判别)。在R中,马氏距离的计算仅需 t, m, s 三个参数
```
d=read.csv("hmeq.csv",na.strings="") # read HMEQ data; empty strings become NA
dim(d)
View(d)
dc =d[complete.cases(d),] # keep only complete rows (remove missing values)
dim(dc)
mdist <- function(x) {
  # Squared Mahalanobis distance of each row of `x` from the column-mean
  # vector, using the sample covariance matrix of `x`.
  #
  # Args:
  #   x: numeric matrix or data frame (rows = observations, cols = variables).
  # Returns:
  #   Numeric vector, one squared distance per row of `x`.
  mat <- as.matrix(x)      # coerce to a numeric matrix
  center <- colMeans(mat)  # column means (was apply(t, 2, mean))
  covmat <- var(mat)       # var() on a matrix yields the covariance matrix
  mahalanobis(mat, center, covmat)
  # Changes vs. original: removed unused local `p`; renamed local `t`
  # (it shadowed base::t); `<-` assignment; colMeans over apply.
}
# Split the data by class
dc1 = dc[dc$BAD==1,] # first column BAD is a 0/1 categorical variable
dc0 = dc[dc$BAD==0,]
# Mahalanobis distance within each group
mdc1 = mdist(dc1[,-c(1,5,6)]) # drop columns 1, 5, 6 (not numeric)
mdc0 = mdist(dc0[,-c(1,5,6)])
c=qchisq(0.99,10) # chi-square critical value: under normality 99% of squared distances fall below this. NOTE(review): name `c` shadows base c()
#10 is the degrees of freedom: 13 variables minus columns 1, 5, 6 leaves 10
mdc=mdist(dc[,-c(1,5,6)])
# Filter observations by Mahalanobis distance
x1=dc1[mdc1>c,] # NOTE(review): reconstructed — source was truncated ("x1=dc1[mdc1"); presumably selects rows whose distance exceeds the chi-square cutoff c
x0=dc0[mdc0>c,] # NOTE(review): reconstructed from truncated text, as above
# Logistic regression
lg.fit = glm(BAD~.,data=d,family=binomial) # BAD~. : BAD is the response, all other columns are predictors; without family= it would default to a linear model, binomial gives logistic regression
summary(lg.fit$fitted.values)
# LDA
pred1 = predict(lg.fit,type="response")
pred1[pred1>0.3] <-1 # classify fitted probabilities with a 0.3 cutoff
pred1[pred1<=0.3] <-0
table(pred1,dc$BAD) # confusion matrix; NOTE(review): model fit on d but table uses dc$BAD — confirm intended
library(pROC)
roc(dc$BAD,pred1,plot=TRUE,print.auc=TRUE,legacy.axes=TRUE) # response, predictor; draw the ROC curve, print the AUC, force x-axis ordered (0,1)
```
```
heart <- read.csv("heart.csv",header=T,na.strings="?")
summary(heart) # no missing values
names(heart)
heart$target[heart$target==1]<-"Yes"
heart$target[heart$target==0]<-"No"
set.seed(1234) # fix RNG for reproducible sampling
train<-sample(nrow(heart),0.7*nrow(heart)) # 70/30 split
theart<-heart[train,]  # training set
vheart<-heart[-train,] # test set
library(rpart)
dtree<-rpart(target~.,data=theart,method="class",parms=list(split="information"))
printcp(dtree)
print(dtree)
tree<-prune(dtree,cp=dtree$cptable[which.min(dtree$cptable[,"xerror"]),"CP"]) # prune at the cp with minimal cross-validated error (xerror)
opar<-par(no.readonly=T) # snapshot of modifiable graphics parameters, restored below
par(mfrow=c(1,2))
#install.packages("rpart.plot")
library(rpart.plot)
rpart.plot(dtree,branch=1,type=2,fallen.leaves=T,cex=0.6,sub="Before")
rpart.plot(tree,branch=1,type=4,fallen.leaves=T,cex=0.6,sub="After")
par(opar)
predtree<-predict(tree,newdata=vheart,type="class") # predict on the held-out set
table(vheart$target,predtree,dnn=c("True","Predict")) # confusion matrix
```
```
d=read.csv("train.csv",header=TRUE)
dc=d[complete.cases(d),]
d0=d[d$y==0,]
d1=d[d$y==1,]
d2=d[d$y==2,]
d3=d[d$y==3,]
# draw a 1..10 label per row of each class so rows can be split ~50/50 below
label0=sample(c(1:10),dim(d0[1]),replace=TRUE)
label1=sample(c(1:10),dim(d1[1]),replace=TRUE)
label2=sample(c(1:10),dim(d2[1]),replace=TRUE)
label3=sample(c(1:10),dim(d3[1]),replace=TRUE)
d0_train=d0[label0<=5,]
d0_test=d0[label0>5,]
d1_train=d1[label1<=5,]
d1_test=d1[label1>5,]
d2_train=d2[label2<=5,]
d2_test=d2[label2>5,]
d3_train=d3[label3<=5,]
d3_test=d3[label3>5,]
d_train=rbind(d0_train,d1_train,d2_train,d3_train)
d_test=rbind(d0_test,d1_test,d2_test,d3_test)
library(nnet)
re_log=multinom(y~.-id,data=d_train) # y~.-id : every column except id as predictor; glm-like but supports a multi-class response
pred_log=predict(re_log,newdata=d_test) # score the test set with the fitted model; note the newdata= argument
tab_log=table(d_test$y,pred_log) # confusion-matrix-style comparison; d_test$y is the test-set response only
library(rpart)
re_id3=rpart(y~.-id,data=d_train,method="class") # method="class" for classification
re_id3_mistake=rpart(y~.-id,data=d_train)
library(RWeka)
re_id3=rpart(y~.-id,data=d_train,method="class",parms=list(split="information"))
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"))
pred_id3=predict(re_id3,newdata=d_test)
pred_CART=predict(re_CART,newdata=d_test,type="class")
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"),control=rpart.control(cp=0.001)) # cp defaults to 0.01; smaller cp → bigger tree before pruning
table(d_test$y,pred_CART)
re_CART=rpart(y~.-id,data=d_train,method="class",parms=list(split="gini"),control=rpart.control(cp=0.0001))
# tree pruning
min=which.min(re_CART$cptable[,4])
min # row of the cptable with the smallest cross-validated error
re_CART_f=prune(re_CART,cp=re_CART$cptable[min,1])
table(d_test$y,pred_CART)
plot(re_CART)
# random forest
d_train$y=as.factor(d_train$y)
re_rf=randomForest(y~.-id,data=d_train,ntree=5)
pred_rf=predict(re_rf,newdata=d_test,type="prob")
d_train$y[d_train$y>=1]=1
d_test$y[d_test$y>=1]=1
```
```
library(randomForest)
library(rpart)
library(rpart.plot)
heart <- read.csv("heart.csv",header=T,na.strings="?")
summary(heart)
data.index = sample(c(1,2), nrow(heart), replace = T, prob = c(0.7, 0.3))
train_data = heart[which(data.index == 1),] # training set
test_data = heart[which(data.index == 2),]  # test set
n<-length(names(train_data))
rate=matrix()
# grid-search mtry (1..n-1) and ntree (1..100), recording the mean OOB error rate
for (i in 1:(n-1)) {
  mtry=i
  for(j in (1:100)) {
    set.seed(1234)
    rf_train=randomForest(as.factor(train_data$target)~.,data=train_data,mtry=i,ntree=j)
    rate[(i-1)*100+j]=mean(rf_train$err.rate)
  }
}
z=which.min(rate)
print(z)
set.seed(1234)
rf_train<-randomForest(as.factor(train_data$target)~.,data=train_data,mtry=1,ntree=500,importance=TRUE,proximity=TRUE)
importance<-importance(rf_train)
barplot(rf_train$importance[,1],main="importance")
box()
importance(rf_train,type=2)
varImpPlot(x=rf_train,sort=TRUE,n.var=nrow(rf_train$importance)) # show variable importance
print(rf_train)
hist(treesize(rf_train))
max(treesize(rf_train))
MDSplot(rf_train,train_data$target,palette=rep(1,2),pch=as.numeric(train_data$target))
pred<-predict(rf_train,newdata=test_data)
pred_out_1<-predict(object=rf_train,newdata=test_data,type="prob")
table<-table(pred,test_data$target)
sum(diag(table))/sum(table) # overall accuracy from the confusion matrix
plot(margin(rf_train,test_data$target))
```

bootstrap

1. 总体数量不知道;
2. 从部分样本有放回的重采样 i 次,将多次抽样的估计量(均值等)作为整体分布的结果
3. 新的观测个数 = 原来的样本个数
4. 新的抽样样本 = 自助抽样法

蒙特卡洛

1. 总体数量知道
2. 从中抽取一个样本(或多个),用这个抽取样本的估计量当作整体估计

```
n=100 # Monte Carlo simulation
alpha=c()
library(MASS)
for (i in 1:100){
  mu1=c(0,0)
  sigma1=matrix(c(1,0.5,0.5,1.25),nrow=2)
  rand1=mvrnorm(n=100,mu=mu1,Sigma=sigma1) # n = sample size, mu = mean vector, Sigma = covariance matrix
  X=rand1[,1]
  Y=rand1[,2]
  alpha[i]=(var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y))
}
rand1
for (j in 1:100){
  ran=rand1[sample(c(1:100),100,replace=TRUE),] # draw 100 rows with replacement; this loop repeats the draw 100 times
  X=ran[,1]
  Y=ran[,2]
  alpha[j]=(var(Y)-cov(X,Y))/(var(X)+var(Y)-2*cov(X,Y))
}
# rand1 holds the fresh multivariate-normal observations (new values that follow the distribution)
# ran resamples the 100 rows of rand1 with replacement, forming a bootstrap sample
```
```
set.seed(1)
y=rnorm(100)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
#(b)
plot(x,y)
#(c) compute LOOCV error
error=c()
d=cbind(x,y) # cbind joins data column-wise
d=as.data.frame(d)
for (i in 1:100)
{
  m1=glm(y~x,data=d[-i,]) # fit without row i (the LOOCV idea)
  pred_m1=predict(m1,newdata=d[i,]) # predict the held-out row with that model
  error[i]=d[i,2]-pred_m1 # error between prediction and true value
}
error # i ranges over 1..100, so we get 100 errors
sum(error^2)
library(boot) # required, otherwise cv.glm is unavailable
m1=glm(y~x,data=d)
m1r=cv.glm(data=d,glmfit=m1,K=100)
m1r$delta
m2=glm(y~poly(x,2),data=d)
m2r=cv.glm(data=d,glmfit=m2,K=100)
m2r$delta
m3=glm(y~poly(x,3),data=d)
m3r=cv.glm(data=d,glmfit=m3,K=100)
m3r$delta
m4=glm(y~poly(x,4),data=d)
m4r=cv.glm(data=d,glmfit=m4,K=100)
m4r$delta
```
```
# data cleaning
# 1. missing rows
data=data[complete.cases(data),] # drop rows containing NA
data=data[!complete.cases(wine),] # show rows containing NA; NOTE(review): references `wine` — probably meant `data`, confirm
# 2. remove duplicates
data=unique(data)
# 3. inspect missing values
c=is.na(data)
# 4. mark missing values at read time
data=read.csv("data.csv",na.strings="") # was na.string= (relied on partial argument matching)
```

1. PCA
2. k-means

```
X <- rbind(matrix(rnorm(20*50, mean = 0), nrow = 20),
           matrix(rnorm(20*50, mean=0.7), nrow = 20),
           matrix(rnorm(20*50, mean=1.4), nrow = 20)) # rbind stacks rows together; cbind joins columns
X.pca = prcomp(X)
plot(X.pca$x[,1:2], col=c(rep(1,20), rep(2,20), rep(3,20))) # first two PC scores; was X.pca[,1:2], which errors because prcomp returns a list — scores live in $x. col repeats each colour 20 times
res = kmeans(X, centers = 3) # k = 3 clusters
true_class = c(rep(1,20), rep(2,20), rep(3,20))
table(res$cluster, true_class) # read by row: if each cluster concentrates in one class, the data are perfectly clustered
```

## Tree
## Tree2
## Decision tree
## Bootstrap
## Cross-validation
## data cleaning
## Clustering(k-means)& PCA
## LDA & shu