data() # list the datasets in all currently loaded packages
data(package = .packages(all.available = TRUE)) # list the datasets in all installed packages
y = rep(c(1, 2, 3), c(20, 20, 20)) # generates twenty 1s, twenty 2s, and twenty 3s
Remove missing values:
na.omit(A) # or equivalently: A[complete.cases(A), ]
rnorm() generates normally distributed random numbers; the count, mean, and standard deviation can all be specified.
cor() computes the matrix of pairwise correlation coefficients between variables.
# Centering: scale(data, center=T, scale=F)
#### Standardization: scale(data, center=T, scale=T), or simply scale(data) with the default arguments
Variables are usually standardized before running PCA.
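A minimal sketch of the functions noted above, on made-up data (illustrative only):
set.seed(42)
x1 <- rnorm(100, mean = 5, sd = 2)   # 100 normal deviates with mean 5, sd 2
x2 <- rnorm(100)                     # defaults: mean 0, sd 1
m <- cbind(x1, x2)
cor(m)                               # 2x2 matrix of pairwise correlations
centered <- scale(m, center = TRUE, scale = FALSE)   # centering only
standardized <- scale(m)             # center, then divide by the sd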
y = c(rep(-1, 10), rep(1, 10)) # rep repeats a value: -1 ten times, then 1 ten times
Unsupervised learning: only X values, no labeled response.
Two main types of unsupervised learning: cluster analysis and principal component analysis.
A qualitative response variable; qualitative variables are also called categorical variables.
In linear regression the response (Y) is continuous, while the predictors (X) may be continuous or categorical.
Logistic regression is the reverse: the response must be categorical, never continuous. It can be binary or multi-class, and a multi-class response may be ordered or unordered.
Least squares (https://www.zhihu.com/question/37031188)
Project each point vertically onto the fitted line and minimize the residual sum of squares, sum((y - yhat)^2).
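A quick sketch of least squares via lm() on simulated data (illustrative):
set.seed(1)
x <- rnorm(50)
y <- 2 * x + rnorm(50)
fit <- lm(y ~ x)
sum(residuals(fit)^2)   # the quantity least squares minimizes: sum((y - yhat)^2)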
Decision trees (https://blog.csdn.net/u010089444/article/details/53241218)
ID3 algorithm
At each node, choose the feature with the largest information gain as the split criterion.
https://blog.csdn.net/xiaohukun/article/details/78055132
Information gain = entropy - conditional entropy
In decision-tree learning, information gain is a key criterion for feature selection: it measures how much information a feature contributes to the classification. The more information it brings, the more important the feature, and the larger its information gain.
https://www.zhihu.com/question/22104055
The larger the entropy, the more disordered the outcome;
the smaller the entropy, the more ordered (predictable) it is.
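A small sketch of entropy and information gain; entropy and info_gain are hypothetical helper names:
entropy <- function(y) {
  p <- table(y) / length(y)
  -sum(p * log2(p))          # Shannon entropy in bits
}
info_gain <- function(y, split) {   # split: a logical vector defining the branch
  w <- mean(split)
  entropy(y) - (w * entropy(y[split]) + (1 - w) * entropy(y[!split]))  # H(Y) - H(Y|split)
}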
https://blog.csdn.net/wxn704414736/article/details/80512705
CART
The smaller the Gini index, the purer the node.
The split point with the smallest weighted Gini is taken as the optimal split, and it divides the data into two subsets.
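Gini impurity in a few lines (sketch; gini is a hypothetical helper name):
gini <- function(y) {
  p <- table(y) / length(y)
  1 - sum(p^2)               # 0 for a pure node, larger as classes mix
}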
Classification and regression: supervised learning.
Clustering: unsupervised learning.
https://blog.csdn.net/chenKFKevin/article/details/70547549
————————————
PCA: a dimensionality-reduction tool
The covariance matrix is the key to implementing PCA: compute the covariance matrix, then its eigenvectors (working with vector transposes for the projection).
https://www.zhihu.com/question/41120789
pinkyjie.com/2011/02/24/covariance-pca/
### prcomp(data, scale = TRUE) # scale = TRUE standardizes the data first (the formal argument name is scale.)
prcomp() is R's principal component analysis function.
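A sketch of PCA done by hand via the covariance matrix, checked against prcomp() (uses the built-in iris data for illustration):
X <- scale(iris[, 1:4])             # standardize first
e <- eigen(cov(X))                  # eigenvectors of the covariance matrix = principal axes
scores <- X %*% e$vectors           # project the data onto those axes
pr <- prcomp(X)
all.equal(abs(unname(pr$rotation)), abs(e$vectors))  # identical up to sign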
————————————
Confusion matrix
https://www.zhihu.com/question/36883196
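Building a confusion matrix with table() (toy labels for illustration):
actual    <- factor(c("pos", "pos", "neg", "neg", "pos"))
predicted <- factor(c("pos", "neg", "neg", "neg", "pos"))
cm <- table(actual, predicted)      # rows = truth, columns = prediction
sum(diag(cm)) / sum(cm)             # overall accuracy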
Support vector machines (e.g., for text classification)
https://www.zhihu.com/question/21094489
knn
kmeans https://zhuanlan.zhihu.com/p/31580379
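A minimal k-means sketch (iris as stand-in data; k = 3 is an illustrative choice):
set.seed(7)
km <- kmeans(scale(iris[, 1:4]), centers = 3, nstart = 25)
table(km$cluster, iris$Species)     # compare the clusters with the known species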
import numpy as np
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns  # needed for the heatmap below
wine=pd.read_excel(r'D:\未名学院\第4节课\作业材料\winequality-white.xlsx')
wine.info()
Xvar=wine[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']]
Yvar=wine['quality']
t=Xvar.corr()
Plot the correlation heatmap:
plt.figure(figsize=(10,8))
sns.heatmap(np.abs(t),annot=True)
https://www.kaggle.com/xvivancos/tutorial-clustering-wines-with-k-means
https://www.kaggle.com/maitree/wine-quality-selection
cov_sdc = cov(wine)                 # covariance matrix (cov(scale(wine)) gives the correlation matrix)
eigen(cov_sdc)                      # its eigenvalues and eigenvectors
res.pca <- PCA(wine[,-12], graph = TRUE)   # FactoMineR::PCA, dropping column 12 (quality)
eig.val <- get_eigenvalue(res.pca)         # factoextra::get_eigenvalue
eig.val
# Data import
wine = read.csv('winequality-white.csv', header = TRUE)
# (or assign an already-loaded data frame: wine = winequality_white)
#data cleaning
wine = wine[complete.cases(wine),]
#PCA
library(stringr)
library(FactoMineR)
library(factoextra)  # for get_eigenvalue()
# Plotting
res.pca <- PCA(wine[,-12], graph = TRUE)#delete Y=quality, plot the PCA graph
sdc = scale(wine)      # standardize all columns (quality included here)
pca.d = prcomp(sdc)
summary(pca.d)         # proportion of variance explained by each component
# PCA-based dimensionality reduction: drop columns 9-11
wine = wine[, -(9:11)]
# Look at the distribution of the response to decide how to treat it
hist(wine$quality)
# Split the data by quality level
wine0 = wine[wine$quality==3,]
wine1 = wine[wine$quality==4,]
wine2 = wine[wine$quality==5,]
wine3 = wine[wine$quality==6,]
wine4 = wine[wine$quality==7,]
wine5 = wine[wine$quality==8,]
# Sampling: give each row a random label 1-10; labels <= 5 go to train (roughly 50/50)
# (fix: the original dim(wine0[1]) is not a row count; nrow() is)
label0 = sample(1:10, nrow(wine0), replace = TRUE)
label1 = sample(1:10, nrow(wine1), replace = TRUE)
label2 = sample(1:10, nrow(wine2), replace = TRUE)
label3 = sample(1:10, nrow(wine3), replace = TRUE)
label4 = sample(1:10, nrow(wine4), replace = TRUE)
label5 = sample(1:10, nrow(wine5), replace = TRUE)
wine0_train = wine0[label0<=5,]
wine0_test = wine0[label0>5,]
wine1_train = wine1[label1<=5,]
wine1_test = wine1[label1>5,]
wine2_train = wine2[label2<=5,]
wine2_test = wine2[label2>5,]
wine3_train = wine3[label3<=5,]
wine3_test = wine3[label3>5,]
wine4_train = wine4[label4<=5,]
wine4_test = wine4[label4>5,]
wine5_train = wine5[label5<=5,]
wine5_test = wine5[label5>5,]
wine_train = rbind(wine0_train,wine1_train,wine2_train,wine3_train,wine4_train,wine5_train)
wine_test = rbind(wine0_test,wine1_test,wine2_test,wine3_test,wine4_test,wine5_test)
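The per-level blocks above could be written as one loop; a sketch under the same 50/50 scheme:
set.seed(123)
parts <- lapply(3:8, function(q) {
  sub <- wine[wine$quality == q, ]
  lab <- sample(1:10, nrow(sub), replace = TRUE)
  list(train = sub[lab <= 5, ], test = sub[lab > 5, ])
})
wine_train <- do.call(rbind, lapply(parts, `[[`, "train"))
wine_test  <- do.call(rbind, lapply(parts, `[[`, "test"))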
wine_train$quality = as.factor(wine_train$quality)  # convert the response before fitting classifiers
library(nnet)          # for multinom()
library(randomForest)
re_log = multinom(quality ~ ., data = wine_train)   # multinomial logistic regression
re_rf = randomForest(quality ~ ., data = wine_train, ntree = 5)
######################################
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)
#########################################
Grow the tree ID3-style (information-gain splits)
re_id3 <-rpart(quality~.,data=wine_train,method="class", parms=list(split="information"))
fancyRpartPlot(re_id3)
########################################
Grow the tree with CART (Gini splits)
re_CART = rpart(quality~.,data= wine_train,method = "class",parms = list(split="gini"),control=rpart.control(cp=0.000001))
fancyRpartPlot(re_CART,main = "CART")
min = which.min(re_CART$cptable[,4])                       # row with the lowest cross-validated error (xerror)
re_CART_f = prune(re_CART, cp = re_CART$cptable[min, 1])   # prune at that cp
pred_id3 = predict(re_id3, newdata = wine_test, type = "class")
pred_CART = predict(re_CART, newdata = wine_test, type = "class")
table(wine_test$quality,pred_CART)
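Overall accuracy can be read off the same comparison (sketch):
mean(as.character(pred_CART) == as.character(wine_test$quality))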
wine_train$quality = as.factor(wine_train$quality)   # no-op if already converted above
re_rf = randomForest(quality ~ ., data = wine_train, ntree = 50)
pred_rf = predict(re_rf, newdata = wine_test, type = "prob")   # matrix of class probabilities
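To turn the probability matrix into class labels and tabulate (sketch, using base R's max.col):
pred_lab <- colnames(pred_rf)[max.col(pred_rf)]   # most probable class per row
table(wine_test$quality, pred_lab)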
wine$quality   # inspect the response column
Linear regression
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
library(corrgram)
library(lattice) #required for nearest neighbors
library(FNN) # nearest neighbors techniques
library(pROC) # to make ROC curve
# install.packages('corrgram') # run once if corrgram is not yet installed
# Column names containing spaces must be backquoted inside a formula
linear_quality = lm(quality ~ `fixed acidity` + `volatile acidity` + `citric acid` + `residual sugar` + chlorides + `free sulfur dioxide` + `total sulfur dioxide` + density, data = wine)
corrgram(wine, lower.panel=panel.shade, upper.panel=panel.ellipse)
wine$poor <- wine$quality <= 4
wine$okay <- wine$quality == 5 | wine$quality == 6
wine$good <- wine$quality >= 7
head(wine)
summary(wine)
############# KNN
# Note: using the same rows as train and test gives optimistic (in-sample) accuracy
class_knn10 = knn(train = wine[,1:8], test = wine[,1:8], cl = wine$good, k = 10)
class_knn20 = knn(train = wine[,1:8], test = wine[,1:8], cl = wine$good, k = 20)
table(wine$good,class_knn10)
table(wine$good,class_knn20)
########################################
wine123 = winequality_white
wine123$poor <- wine123$quality <= 4
wine123$okay <- wine123$quality == 5 | wine123$quality == 6
wine123$good <- wine123$quality >= 7
library(rpart) #for trees
tree1 = rpart(good~ alcohol + sulphates+ pH , data = wine123, method="class")
rpart.plot(tree1)
summary(tree1)
pred1 = predict(tree1,newdata=wine123,type="class")
summary(pred1)
summary(wine123$good)
Compare the models' accuracy:
# an alternative feature set that was tried: good ~ alcohol + `volatile acidity` + `citric acid` + pH
tree2 = rpart(good ~ alcohol + `volatile acidity` + `citric acid` + sulphates, data = wine123, method = "class")
rpart.plot(tree2)
pred2 = predict(tree2, newdata = wine123, type = "class")
summary(pred2)
summary(wine123$good)
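Comparing the two trees' in-sample accuracy (sketch; same-data evaluation is optimistic):
mean(as.character(pred1) == as.character(wine123$good))
mean(as.character(pred2) == as.character(wine123$good))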