set.seed(1010)
(A=matrix(rnorm(12,-1,1),3,4)) # wrapping an assignment in parentheses prints the result
(B=matrix(rnorm(12,1,1),4,3))
(2*A+3*t(B))
(C=A%*%B)# %*% is the matrix multiplication operator
import numpy as np
np.random.seed(1010)
A=np.random.normal(-1,1,size=12).reshape(3,4)
B=np.random.normal(1,1,size=12).reshape(4,3)
2*A+3*B.T
C=np.dot(A,B)# or C=A.dot(B)
print(A,"\n",B,"\n",C)
det(C)# determinant
CI=solve(C)# matrix inverse
CI%*%C
round(CI%*%C,14)
K=A%x%B # %x% is the Kronecker product operator
dim(K) # dimensions
(M=B%*%solve(t(B)%*%B)%*%t(B))# idempotent (projection) matrix
round(M%*%M-M,14)
print (np.linalg.det(C))# determinant
CI=np.linalg.inv(C)# matrix inverse
print (CI.dot(C))
print (np.round(CI.dot(C),14))
K=np.kron(A,B)# Kronecker product
K.shape # dimensions
M=np.dot(np.dot(B,np.linalg.inv(\
np.dot(B.T,B))),B.T)
print(M,"\n",np.round(np.dot(M,M)-M,14))
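# Added sanity check (a small sketch, not part of the original listing): the
# projection matrix M built above should be symmetric and idempotent.
print(np.allclose(M, M.T), np.allclose(np.dot(M, M), M))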
set.seed(1010)
X=matrix(rnorm(100),25,4)
XC=cor(X)
a=eigen(XC)
a$values # eigenvalues; can be abbreviated as a$va
a$vectors # eigenvectors; can be abbreviated as a$ve
round(t(a$vectors)%*%a$vectors,14)
(tr=sum(a$va));sum(diag(XC))
np.random.seed(1010)
X=np.random.normal(0,1,size=100) # 100 standard normals, reshaped to 25x4 below (mirrors rnorm(100))
X.shape=25,4
XC=np.corrcoef(X.T)
w,v=np.linalg.eig(XC)# eigenvalues and eigenvectors
np.round(v.T.dot(v),14)
print(w,"\n",v)
print (np.sum(w))
print (np.sum(np.diag(XC)))
s=svd(X)
s$u
s$d # diagonal elements of D
s$v
round(t(s$u)%*%s$u,14)
round(t(s$v)%*%s$v,14)
round(s$u%*%diag(s$d)%*%t(s$v)-X,10)
sqrt(eigen(t(X)%*%X)$va)# equal to s$d
u,d,v=np.linalg.svd(X,full_matrices=False)
print (u,"\n",d,"\n",v)
np.round(np.dot(u.T,u),14)
np.round(np.dot(v.T,v),14)
np.round(np.dot(np.dot(u,np.diag(d)),v)-X)
eva,eve=np.linalg.eig(np.dot(X.T,X))
print(np.sqrt(eva),d)
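# Added sanity check (a small sketch, not part of the original listing):
# u diag(d) v should reconstruct X, and d should equal the sorted sqrt of eva.
print(np.allclose(np.dot(u*d, v), X))
print(np.allclose(np.sort(np.sqrt(eva))[::-1], d))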
b=qr(B)
qr.Q(b)# the Q matrix
qr.R(b)# the R matrix
b$rank # rank of B
Q,R=np.linalg.qr(B)
print(Q)
print(R)
np.linalg.matrix_rank(B)# rank of B
CXC=np.linalg.cholesky(XC)
print(np.round(np.dot(CXC,CXC.T)-XC,15))
w=read.csv("BostonHousing2.csv")[,-c(1:5)]
w$chas=factor(w$chas)
w=read.csv("BostonHousing2.csv")[,-c(1:5)]
library(GGally)
ggpairs(w[,-5])
summary(w)
a=lm(cmedv~.,w);a$coefficients
CV=function(data=w,Z=10,seed=1010){
n=nrow(data);N=1:n;Z=10
set.seed(seed)
mm=sample(rep(1:Z,ceiling(n/Z))[N])
K=list();for(i in 1:Z)K[[i]]=N[mm==i]
return(K)
}
Z=10;mm=CV(w)
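# Added sanity check (not in the original code): the folds returned by CV()
# should partition the row indices 1:n exactly once each.
stopifnot(setequal(unlist(mm), 1:nrow(w)), sum(sapply(mm, length)) == nrow(w))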
y_test=NULL->y_pred
for(i in 1:Z){
at=lm(cmedv~.,w[-mm[[i]],])
y_test=c(y_test,w$cmedv[mm[[i]]])
y_pred=c(y_pred,predict(at,w[mm[[i]],]))
}
sse=sum((y_test-y_pred)^2)
sst=sum((y_test-mean(y_test))^2)
nmse=sse/sst;r2=1-nmse; print(c(nmse,r2))
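# Added note: NMSE compares the cross-validated SSE with the SSE of simply
# predicting the mean of the test responses, and R^2 here is 1 - NMSE.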
a=lm(cmedv~.,w)
summary(a)
anova(a)
shapiro.test(a$residuals)
(w=read.csv("Artif0.csv"))
(w1=read.csv("Artif1.csv"))
a=lm(Income~.,w) # or a=lm(Income~Age+Edu+Sex,w)
a1=lm(Income~factor(Age)+factor(Edu)+factor(Sex),w1)
for (i in c(1,2,4)) w1[,i]=factor(w1[,i])
a1=lm(Income~.,w1)
summary(a)
summary(a1)
anova(a)
anova(a1)
w=read.csv("BostonHousing2.csv")[,-c(1:5)]
library(rpart.plot);(a=rpart(cmedv~.,w))
rpart.plot(a,extra = 1)
b=lm(cmedv~.,w); a=rpart(cmedv~.,w);resid=w$cmedv-predict(a,w)
b.mse=mean((b$res)^2);a.mse=mean(resid^2)
par(mfrow=c(1,2))
plot(b$res,ylab="residuals",main="Linear Regression: MSE=21.50")
plot(resid,ylab="residuals",main="Decision Tree: MSE=15.66")
CV=function(data=w,Z=10,seed=1010){
n=nrow(data);N=1:n;Z=10
set.seed(seed)
mm=sample(rep(1:Z,ceiling(n/Z))[N])
K=list();for(i in 1:Z)K[[i]]=N[mm==i]
return(K)
}
n=nrow(w);N=1:n;Z=10;mm=CV()
rs=NULL;
for(i in 1:Z){
at=rpart(cmedv~.,w[-mm[[i]],])
pred=predict(at,w[mm[[i]],])
rs=rbind(rs,data.frame(y=w[mm[[i]],1],pred=pred))
}
mse=mean((apply(rs,1,diff))^2)
mst=mean((rs[,1]-mean(rs[,2]))^2)
mse/mst #0.264
################
rs2=NULL;
for(i in 1:Z){
al=lm(cmedv~.,w[-mm[[i]],])
pred=predict(al,w[mm[[i]],])
rs2=rbind(rs2,data.frame(y=w[mm[[i]],1],pred=pred))
}
mse2=mean((apply(rs2,1,diff))^2)
mst2=mean((rs2[,1]-mean(rs2[,2]))^2)
mse2/mst2 #0.2739
library(ipred) # bagging() for regression comes from ipred
set.seed(1010)
rsb=NULL;
for(i in 1:Z){
ab=bagging(cmedv~.,w[-mm[[i]],])
pred=predict(ab,w[mm[[i]],])
rsb=rbind(rsb,data.frame(y=w[mm[[i]],1],pred=pred))
}
mseb=mean((apply(rsb,1,diff))^2)
mstb=mean((rsb[,1]-mean(rsb[,2]))^2)
mseb/mstb #0.1789336
w=read.csv("BostonHousing2.csv")[,-c(1:5)]
library(randomForest)
a.rf=randomForest(cmedv~.,w,importance=TRUE, localImp=TRUE,proximity=TRUE)
layout(t(1:2))
for (i in 1:2)
barplot(t(a.rf$importance[,i]),names.arg=row.names(a.rf$importance),
cex.names = 0.7,horiz=TRUE,col=4,las=1,xlab=colnames(a.rf$importance)[i])
matplot(1:13,a.rf$local,type = "l",xaxt="n",
xlab="Variable",ylab = "Local importance")
axis(side = 1,at = 1:13,labels = rownames(a.rf$local),las=0,cex.axis=1)
par(mfrow=c(1,3))
partialPlot(a.rf, pred.data=w, x.var=lstat)
partialPlot(a.rf, pred.data=w, x.var=rm)
partialPlot(a.rf, pred.data=w, x.var=zn)
plot(outlier(a.rf$proximity), type="h",ylab="Outlying measure")
r3=NULL;
for(i in 1:Z){
a3=randomForest(cmedv~.,w[-mm[[i]],])
pred=predict(a3,w[mm[[i]],])
r3=rbind(r3,data.frame(y=w[mm[[i]],1],pred=pred))
}
mse3=mean((apply(r3,1,diff))^2)
mst3=mean((r3[,1]-mean(r3[,2]))^2)
mse3/mst3 #0.1202
library(ipred);library(mboost);library(party);library(kernlab)
library(ranger);library(kknn);library(randomForest);library(partykit)
CV=function(data=w,Z=10,seed=1010){
n=nrow(data);N=1:n;Z=10
set.seed(seed)
mm=sample(rep(1:Z,ceiling(n/Z))[N])
K=list();for(i in 1:Z)K[[i]]=N[mm==i]
return(K)
}
Z=10;D=1;mm=CV(w);ff=cmedv ~ .
sst=sum((w[,D]-mean(w[,D]))^2);sse=vector()
J=1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=lm(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=bagging(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=blackboost(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=cforest(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=ctree(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=ranger(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,])$predictions)^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
yp=kknn(formula=ff,train=w[-m,],test=w[m,])$fitted
sse.a=sse.a+sum((w[m,D]-yp)^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=randomForest(ff,data =w[-m,])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,]))^2)
}
sse[J]=sse.a
J=J+1;set.seed(1010);sse.a=0
for(i in 1:Z) {
m=mm[[i]]
a=ksvm(as.matrix(w[-m,-D]),w[-m,D])
sse.a=sse.a+sum((w[m,D]-predict(a,w[m,-D]))^2)
}
sse[J]=sse.a
nmse=sse/sst
r2=1-nmse
reg.names=c('lm','bagging','blackboost','cforest','ctree','ranger',
'kknn','randomForest','ksvm')
barplot(nmse[order(nmse)],horiz = TRUE, names.arg=reg.names[order(nmse)],
col=4,las=2)
title("NMSE for 9 regression methods")
library(caret)
rf=train(cmedv~., data = w, method = "rf")
rf
rf$finalModel
tr=trainControl(method="repeatedcv",number=10,repeats=4)
rf1=train(cmedv~., data = w, method = "rf", trControl = tr )
tg=data.frame(mtry = seq(2, 10, by =2))# mtry applies only to random forests
rf2=train(cmedv~.,data=w, method="rf",trControl=tr,tuneGrid = tg)
rf2; rf2$results
tr3=trainControl(method = "cv", number = 10, savePredictions = TRUE)
rf3=train(cmedv~., data = w, method = "rf", trControl = tr3)
Res = function(model,model.label,tm , grid = NULL) {
MM=sum((model$pred[,2]-mean(model$pred[,2]))^2)
MSE=sum(apply(model$pred[,1:2],1,diff)^2)
rmse=sqrt(MSE)
nmse=sum(apply(model$pred[,1:2],1,diff)^2)/MM
rsquare=1-nmse
mae=sum(abs(apply(model$pred[,1:2],1,diff)))
perf.grid = NULL
if (is.null(grid)) {
perf.grid = data.frame(Predictor = c(model.label),
NMSE=nmse,Rsquare=rsquare,
RMSE =rmse, MAE=mae, time = c(tm[[3]]))} else {
.grid = data.frame(Predictor = c(model.label),
NMSE=nmse,Rsquare=rsquare,
RMSE =rmse, MAE=mae, time = c(tm[[3]]))
perf.grid = rbind(grid, .grid)
}
perf.grid
}
M.pack=c("earth","ipred", "plyr", "e1071","mboost", "plyr", "import",
"party", "xgboost", 'ranger','gbm','randomForest', 'foreach',
'quantregForest','RRF','inTrees', 'brnn','h2o')
tc_cv<- trainControl(method="cv", number=10, savePredictions = TRUE)
M.label=c('lm','treebag','bagEarth','bagEarthGCV','blackboost',
'ctree','xgbTree','ranger','gbm','parRF','ranger','Rborist','rf','brnn',
'gbm','kknn')
M.name=c("Linear Regression","Bagged CART","Bagged MARS",
"Bagged MARS using gCV Pruning",
"Boosted Tree", "Conditional Inference Tree",
"eXtreme Gradient Boosting","Random Forest",
"Stochastic Gradient Boosting", "Parallel Random Forest",
"Random Forest (ranger)", "Random Forest(Rborist)", "Random Forest(rf)",
"Bayesian Regularized Neural Networks", "Stochastic Gradient Boosting",
"k-Nearest Neighbors")
length(M.name);length(M.label)
Model=list()
for(i in (1:length(M.label))){
set.seed(669); ptm <- proc.time()
Model[[i]]=train(cmedv~.,data = w,method=M.label[i],trControl=tc_cv)
tm = proc.time() - ptm
if(i==1)
perf.grid = Res (Model[[i]],M.name[i],tm , grid = NULL)
else
perf.grid = Res (Model[[i]],M.name[i],tm , grid = perf.grid)
}
perf.grid
(RR=perf.grid[order(perf.grid[,2],decreasing = T),])
RR=perf.grid[order(perf.grid[,2],decreasing = T),]
reg.names=RR[,1]
par(mar=c(4,9,2,1))
barplot(t(RR[,2]),cex.names=.5,horiz=TRUE,
names.arg=reg.names, col=4,las=2)
title(paste0("NMSE for ", nrow(RR)," regression methods"))
w=read.csv("oliveoil.csv")
w$nation=factor(c(rep("Greece",5),rep("Italy",5),rep("Spain",6)))
library(car)
scatterplotMatrix(~K232+brown+DK|nation,data=w, smooth=FALSE,
reg.line=FALSE, ellipse=TRUE, by.groups=TRUE, diagonal="none")
a=lm(cbind(K232, DK, brown)~nation,data=w)
summary(a)
(manova.w=Anova(a))
summary((manova.w))
linearHypothesis(a,"0.4*nationSpain+0.6*nationItaly",verbose=TRUE)
linearHypothesis(a,"*nationSpain=nationItaly",verbose=TRUE)
import numpy as np
import matplotlib
matplotlib.use('TkAgg')# not needed on Windows
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import pandas as pd
import os
os.getcwd()
os.chdir('/work/')
w=pd.read_csv('BostonHousing2.csv')
wn0=w.columns
wn=wn0[5:] # drop the first 5 variables, which are not used in the model
f=plt.figure(figsize=(16, 8))
k=0
for i in range(len(wn)):
for j in range(len(wn)):
k=k+1
if i!=j:
f.add_subplot(len(wn),len(wn),k)
plt.scatter(w[wn[i]],w[wn[j]])
#plt.axis('off') # hide the axes and frame
else:
f.add_subplot(len(wn),len(wn),k)
plt.scatter([0,1],[0,1])
plt.text(.5,.5, wn[i], \
ha='center', va='center',size=10) # print the variable name
y=np.array(w[wn[0]])[:,np.newaxis] # convert to a column vector
X=np.array(w[wn[1:]])
import statsmodels.api as sm
mod=sm.OLS(y,X)
res=mod.fit()
print (res.summary())
from sklearn import linear_model
regr = linear_model.LinearRegression(fit_intercept=False) # no intercept term
regr.fit(X, y)
print(regr.coef_) # print the estimated coefficients
#yhat=X.dot(regr.coef_.reshape(10,1)) # compute the fitted values directly
#resid=y-yhat # compute the residuals directly
res=y-regr.predict(X) # compute the residuals
import scipy.stats as stats
import pylab
res.shape=res.shape[0] # flatten to a vector of length n (the sample size)
f=plt.figure(figsize=(12,5))
f.add_subplot(121)
plt.scatter(regr.predict(X),res) # residuals vs fitted values
plt.plot(regr.predict(X),np.ones(len(y)))
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
f.add_subplot(122)
stats.probplot(res, dist="norm", plot=pylab) # normal Q-Q plot of the residuals
X1=np.insert(X,0,1, axis=1)# insert a column of 1s at position 0 (axis=1 means columns)
u=pd.read_csv('BostonHousing2.csv')
un0=u.columns
un=un0[5:] # drop the first 5 variables, which are not used in the model
y=np.array(u[un[0]])[:,np.newaxis]
X=np.array(u[un[1:]])
print(un[0]);print(un[1:])
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.model_selection import cross_val_score
import graphviz # for plotting the tree
regr = DecisionTreeRegressor(max_depth=4,random_state=1010)
regr=regr.fit(X,y)
dot_data = tree.export_graphviz(regr,feature_names=un[1:],out_file=None)
graph = graphviz.Source(dot_data)
graph.render("Bostonpy0") #输出决策树图文件:Bostonpy0.pdf
graph
f=plt.figure(figsize=(12,5))
f.add_subplot(111)
height = regr.feature_importances_
bars = un[1:]
y_pos = np.arange(len(bars))
plt.bar(y_pos, height)
plt.xticks(y_pos, bars, color='green')# variable names
plt.yticks(color='green')
#plt.show()
regr.score(X,y)
from sklearn.ensemble import BaggingRegressor
regr = BaggingRegressor(n_estimators=100,oob_score=True,random_state=1010)
regr.fit(X, y)
print("Score:\n",regr.score(X,y))
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
regr = RandomForestRegressor(oob_score=True,random_state=1010)
regr.fit(X, y)
print("Variable importance:\n",regr.feature_importances_)
print("Score:\n",regr.oob_score_)
import numpy as np
import matplotlib
matplotlib.use('TkAgg')# not needed on Windows
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import pandas as pd
def FCV(x,y,regr, cv=10,seed=1010):
np.random.seed(seed)
ind=np.arange(len(y))
np.random.shuffle(ind) # shuffle the indices
X_folds = np.array_split(x[ind], cv)
y_folds = np.array_split(y[ind], cv)
X2=np.empty((0,X.shape[1]), float)
y2=np.empty((0,y.shape[0]), float)
yp=np.empty((0,y.shape[0]), float)
for k in range(cv):
X_train = list(X_folds) # X_folds is already a list
X_test = X_train.pop(k) # pop out the k-th fold (only lists support pop)
X_train = np.concatenate(X_train) # concatenate the remaining cv-1 folds
y_train = list(y_folds)
y_test = y_train.pop(k)
y_train = np.concatenate(y_train)
regr.fit(X_train,y_train.ravel()) # fit the chosen regr model
y2=np.append(y2,y_test)
X2=np.append(X2,X_test)
yp=np.append(yp,regr.predict(X_test))
nmse=np.sum((y2-yp)**2)/np.sum((y2-np.mean(y2))**2)
r2=1-nmse
return(np.array([nmse, r2]))
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LassoLars, Lars,Lasso,\
LinearRegression
from sklearn.ensemble import AdaBoostRegressor, \
RandomForestRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
names = ["N-Neighbors",
"Lasso Lars", "Lars", "Lasso",
"Linear Regression", "Decision Tree",
"Random Forest", "AdaBoost", "bagging", "SVR"]
regressors = [
KNeighborsRegressor(),
LassoLars(), Lars(), Lasso(), LinearRegression(),
DecisionTreeRegressor(),
RandomForestRegressor(n_estimators=500, oob_score=True),
AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),\
n_estimators=300),
BaggingRegressor(), SVR(C=1.0, epsilon=0.2)]
# iterate over the regression methods
A=np.empty((0,2), float)
for reg in regressors:
tt=np.array(FCV(X,y,reg,10));tt.shape=(1,2)
A=np.append(A,tt,0) # stack the results of all methods
fig=plt.figure(figsize=(20,10))
ax=fig.add_subplot(121)
ax.bar(np.arange(np.array(A).shape[0]), np.array(A)[:,0])
ax.set_xticklabels(names) # label the regression methods
fig.autofmt_xdate() # slant the method labels
ax.set_ylabel('NMSE')
ax.set_title('NMSE')
ax.set_xticks(np.arange(np.array(A).shape[0])+ 0.35)
bx=fig.add_subplot(122)
bx.bar(np.arange(np.array(A).shape[0]), np.array(A)[:,1])
bx.set_xticklabels(names)
fig.autofmt_xdate()
bx.set_ylabel('Score')
bx.set_title('Score')
bx.set_xticks(np.arange(np.array(A).shape[0])+ 0.35)
import pandas as pd
v=pd.read_csv('Artif0.csv')
print(v)
v1=pd.get_dummies(v, drop_first=False)
print(v1)
v2=pd.get_dummies(v, drop_first=True)
print(v2)
w=read.csv("pendigits.csv");w$V17=factor(w$V17)
library(klaR)
partimat(V17~V16+V14,data=w,method="lda")
partimat(V17~V16+V14,data=w,method="naiveBayes")
Fold=function(Z=5,w,D,seed=7777){
n=nrow(w);d=1:n
e=levels(w[,D]);Y=length(e)# the response has Y classes
set.seed(seed)
dd=lapply(1:Y,function(i){
d0=d[w[,D]==e[i]];j=length(d0);ZT=rep(1:Z,ceiling(j/Z))[1:j]
id=cbind(sample(ZT,length(ZT)),d0);id})
#each dd[[i]] above is a matrix whose columns are random fold ids 1:Z and the row indices of class i
mm=lapply(1:Z, function(i){
u=NULL;for(j in 1:Y) u=c(u,dd[[j]][dd[[j]][,1]==i,2]);u})
return(mm)
}
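# Added sanity check (not in the original code): the stratified folds from Fold()
# should partition all row indices of the pendigits data read above.
stopifnot(setequal(unlist(Fold(10, w, 17)), 1:nrow(w)))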
library(MASS)
w=read.csv("pendigits.csv")#读入数据
w[,17]=factor(w[,17]) #因子化哑元变量
Z=10;D=17
mm=Fold(Z=10,w,D=17,1010) # build the 10-fold cross-validation sets
y=NULL->yh
for(i in 1:Z){
m=mm[[i]]
a=lda(V17~.,w[-m,])
y=c(y,w[m,D])
yh=c(yh,predict(a,w[m,])$class)
}
table(y,yh)
(er=sum(y!=yh)/dim(w)[1])
y=NULL->yh
for(i in 1:Z){
m=mm[[i]]
a=qda(V17~.,w[-m,])
y=c(y,w[m,D])
yh=c(yh,predict(a,w[m,])$class)
}
table(y,yh)
(er=sum(y!=yh)/dim(w)[1])
w=read.csv("Trans.csv")
a=glm(Donate~.,w,family=binomial)
summary(a)
pred=predict(a,w,type="response")
table(w$Donate,(pred>0.5)*1 )
BI=function(D,w,ff,fm="binomial"){
a=glm(ff,w,family=fm)
z=predict(a,w,type="response")
L=max(levels(w[,D]))
ee=NULL
for(p in seq(.01,.99,.01)){
u=rep(L,nrow(w));u[!(z>p)]=min(levels(w[,D]))
e=sum(u!=w[,D])/nrow(w);ee=rbind(ee,c(p,e))}
I=which(ee[,2]==min(ee[,2]))
return(ee[min(I),])
}
ff=Donate~. # the formula
w$Donate=factor(w$Donate)# the function requires the response to be a factor
BI(D=4,w,ff) # call the function
w=read.csv("Trans.csv");D=4
library(caTools)
set.seed(1010)
split <- sample.split(w[,D], SplitRatio = 0.75)# the training set takes 75%
# obtain a training set and a test set
w_train <- subset(w, split == TRUE)
w_test <- subset(w, split == FALSE)
#logistic regression
a <- glm (Donate ~ ., data = w_train, family = binomial)
pred <- predict(a,w_test, type = 'response')
#four diagnostic plots:
library(ROCR)
par(mfrow=c(1,4),mar=c(4,4,4,2))
ROCRpred <- prediction(pred, w_test$Donate)
ROCRperf <- performance(ROCRpred, 'tpr','fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2,1.7))
title("ROC curve")
Precperf <- performance(ROCRpred, 'prec','rec')
plot(Precperf, colorize = TRUE, text.adj = c(-0.2,1.7))
title("Precision/recall graph")
Sensperf <- performance(ROCRpred, 'sens','spec')
plot(Sensperf, colorize = TRUE, text.adj = c(-0.2,1.7))
title("Sensitivity/specificity plot")
Liftperf <- performance(ROCRpred, 'lift','rpp')
plot(Liftperf, colorize = TRUE, text.adj = c(-0.2,1.7))
title("Lift chart")
pr <- prediction(pred, w_test$Donate)
auc <- performance(ROCRpred, measure = "auc")
auc <- auc@y.values[[1]]
auc
D=4;Z=10;n=nrow(w);w[,D]=factor(w[,D])# Fold() and the level-based coding below need a factor response
mm=Fold(Z,w,D,8888)
ff=Donate ~ .
E=vector()
J=1;E[J]=0
for(i in 1:Z){
m=mm[[i]]
a=glm(ff,w[-m,],family="binomial")
z=(predict(a,w[m,],type="response")>0.44)
u=rep(levels(w[m,D])[2],nrow(w[m,]));u[!z]=levels(w[m,D])[1]
E[J]=E[J]+sum(w[m,D]!=u)
}
J=J+1;E[J]=0
for(i in 1:Z){
m=mm[[i]]
a=lda(ff,w[-m,])
u=predict(a,w[m,])$class
E[J]=E[J]+sum(w[m,D]!=u)
}
J=J+1;E[J]=0
for(i in 1:Z){
m=mm[[i]]
a=qda(ff,w[-m,])
u=predict(a,w[m,])$class
E[J]=E[J]+sum(w[m,D]!=u)
}
E/n
w=read.csv("pendigits.csv");w[,17]=factor(w[,17])
library(rpart.plot)
(a=rpart(V17~.,w))
rpart.plot(a,type=1,extra=2)
sum(w$V17!=predict(a,w,type="class"))#1830
sum(w$V17!=predict(a,w,type="class"))/nrow(w)#0.1665
library(rpart.plot)
y=NULL->yh
for(i in 1:Z){
m=mm[[i]]
a=rpart(V17~.,w[-m,])
y=c(y,w[m,D])
yh=c(yh,predict(a,w[m,],type="class"))
}
table(y,yh)
(er=sum(y!=yh)/dim(w)[1])
library(randomForest)
set.seed(1010)
RF=randomForest(V17~.,w,importance=TRUE,proximity=TRUE,localImp=TRUE)
sum(w$V17!=predict(RF,w))/nrow(w)
par(mfrow=c(3,4))
for (i in 1:10)
barplot(t(RF$importance[,i]),names.arg=row.names(RF$importance),
cex.names = 0.4,col=4,las=0,xlab=paste("digit:",colnames(RF$importance)[i]))
for (i in 11:12)
barplot(t(RF$importance[,i]),names.arg=row.names(RF$importance),
cex.names = 0.4,col=2,las=0,xlab=colnames(RF$importance)[i])
matplot(1:16,RF$local,type = "l",xaxt="n",
xlab="Variable",ylab = "Local importance")
axis(side = 1,at = 1:16,labels = rownames(RF$local),las=0,cex.axis=1)
par(mfrow=c(1,2))
partialPlot(RF, pred.data=w, x.var=V16)
partialPlot(RF, pred.data=w, x.var=V6)
plot(outlier(RF$proximity), type="h",ylab="Outlying measure")
library(randomForest)
y=NULL->yh
for(i in 1:Z){
m=mm[[i]]
a=randomForest(V17~.,w[-m,])
y=c(y,w[m,D])
yh=c(yh,predict(a,w[m,]))
}
table(y,yh)
(er=sum(y!=yh)/dim(w)[1])
w=read.csv("pendigits.csv")#读入数据
w[,17]=factor(w[,17]) #因子化哑元变量
library(adabag)
set.seed(1010)
a=boosting(V17~.,w)
table(w$V17,predict(a,w)$class)
set.seed(1010)
pen.cv=boosting.cv(V17~.,v=10, data=w)
pen.cv$confusion # confusion matrix
pen.cv$error # misclassification rate
w=read.csv("pendigits.csv")#读入数据
w[,17]=factor(w[,17]) #因子化哑元变量
Z=10;D=17
mm=Fold(Z=10,w,D=17,1010) #10折交叉验证集的确定
library(MASS);library(kknn)
library(randomForest);library(kernlab);library(adabag)
D=17;n=nrow(w)
ff=paste(names(w)[D],"~.");ff=as.formula(ff)
###########
E=vector()
#random forest classification
J=1;error=NULL;set.seed(1010)
for(i in 1:Z){
m=mm[[i]]
a=randomForest(ff,data=w[-m,])
e=sum(w[m,D]!=predict(a,w[m,]))
error=c(error,e)
}
E[J]=sum(error)
#AdaBoost classification
J=J+1;error=NULL;set.seed(8888)
for(i in 1:Z){
m=mm[[i]]
a=boosting(ff,w[-m,])
e=sum(w[m,D]!=predict(a,w[m,])$class)
error=c(error,e)
}
E[J]=sum(error)
#support vector machine classification
J=J+1;error=NULL;set.seed(1010)
for(i in 1:Z){
m=mm[[i]]
a=ksvm(ff,w[-m,])
e=sum(w[m,D]!=predict(a,w[m,]))
error=c(error,e)
}
E[J]=sum(error)
#k-nearest-neighbour classification
J=J+1;error=NULL;set.seed(1010)
for(i in 1:Z){
m=mm[[i]]
e=sum(w[m,D]!=kknn(ff, train= w[-m,],test=w[m,])$fit)
error=c(error,e)
}
E[J]=sum(error)
#linear discriminant analysis classification
J=J+1;error=NULL;set.seed(1010)
for(i in 1:Z){
m=mm[[i]]
a=lda(ff,w[-m,])
e=sum(w[m,D]!=predict(a,w[m,])$class)
error=c(error,e)
}
E[J]=sum(error)
#quadratic discriminant analysis classification
J=J+1;error=NULL;set.seed(1010)
for(i in 1:Z){
m=mm[[i]]
a=qda(ff,w[-m,])
e=sum(w[m,D]!=predict(a,w[m,])$class)
error=c(error,e)
}
E[J]=sum(error)
print(E/n)
barplot(E/n,names.arg = c("Random Forest",
"Adaboost","SVM","KNN","LDA","QDA"),col=4)
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
def Fold(y,X,Z,seed=8888):
id0=np.arange(len(y))
part=[]
for i in np.unique(y):
part.append(id0[y==i]) #part is index of y=i
Zid=[];Xn=[];yn=[]
np.random.seed(seed)
for k in part:
np.random.shuffle(k)#for each y=i shuffle index
yn.extend(y[k])
Xn.extend(X[k])
Zid.extend((list(range(Z))*int(len(k)/Z+1))[:len(k)])
return Zid, yn, Xn
v= pd.read_csv('pendigits.csv',index_col=False)
X=np.array(v[v.columns[:16]])# predictors
y=np.array(v[v.columns[16]])# response
Zid, yn, Xn=Fold(y=y,X=X,Z=10,seed=1010)
yn=np.array(yn)
Xn=np.array(Xn)
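# Added sanity check (a small sketch, not part of the original listing): every
# observation gets exactly one fold id, and all 10 fold ids are used.
assert len(Zid) == len(yn) == len(y)
assert sorted(set(Zid)) == list(range(10))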
def CCV(clf,Zid, Xn, yn):
y_pred=[];yN=[]
for j in np.unique(Zid): #j has Z kinds of values
clf.fit(Xn[Zid!=j],yn[Zid!=j])
yN.extend(yn[Zid==j])
y_pred.extend(clf.predict(Xn[Zid==j]))
y_pred=np.array(y_pred)
yN=np.array(yN)
error=np.sum(yN!=y_pred)/len(yN)
cmatrix=confusion_matrix(yN,y_pred)
return(error,cmatrix)
lda=LinearDiscriminantAnalysis()
lda.fit(X,y)
y_pred=lda.predict(X)
y_score=lda.score(X,y)
print (confusion_matrix(y,y_pred))
print ('Misclassification rate=', 1-y_score)
lda=LinearDiscriminantAnalysis()
er,cm=CCV(lda,Zid,Xn,yn)
print("Error rate=",er)
print (cm)
qda=QuadraticDiscriminantAnalysis()
qda.fit(X,y)
y_pred2=qda.predict(X)
y_score2=qda.score(X,y)
print( confusion_matrix(y,y_pred2))
print( 'Misclassification rate=', 1-y_score2)
qda=QuadraticDiscriminantAnalysis()
er,cm=CCV(qda,Zid,Xn,yn)
print("Error rate=",er)
print (cm)
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc, confusion_matrix
w = pd.read_csv("trans.csv")
w['intercept'] = 1.0 # add an intercept column
X_cols=w.columns[[4,0,1,2]];y_col=w.columns[3]
X=w[X_cols];y=w[y_col]
X=np.array(X);y=np.array(y)
Zid=np.ones(len(y));Zid[:int(len(y)*0.20)]=0
X_train=X[Zid==1,:];y_train=y[Zid==1]
X_test=X[Zid==0,:];y_test=y[Zid==0]
result = sm.Logit(y_train, X_train).fit()
print (result.summary())
y_pred= result.predict(X_test)
fpr, tpr, thresholds =roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("Area under the ROC curve : %f" % roc_auc)
thresholds[np.argmin(np.abs(tpr-(1-fpr)))]
print(confusion_matrix(y_test, 1*(y_pred>0.3421)))
print(np.sum(y_test!=1*(y_pred>0.3421))/len(y_test))
i = np.arange(len(tpr)) # index for df
roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),
'tpr' : pd.Series(tpr, index = i),
'1-fpr' : pd.Series(1-fpr, index = i),
'tf' : pd.Series(tpr - (1-fpr), index = i),
'thresholds' : pd.Series(thresholds, index = i)})
roc.iloc[(roc.tf-0).abs().argsort()[:1]]
# Plot tpr vs 1-fpr
fig, ax = plt.subplots(figsize=(10,4.4))
plt.plot(roc['fpr'],roc['tpr'])
plt.plot(roc['1-fpr'],roc['tpr'])
plt.xlabel('FPR or 1-FPR')
plt.ylabel('TPR')
plt.title('Receiver operating characteristic')
ax.set_xticklabels([])
X=np.array(X);y=np.array(y)
Zid, yn, Xn=Fold(y=y,X=X,Z=10,seed=1010)
yn=np.array(yn)
Xn=np.array(Xn)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
names = ["Linear Discriminant Analysis", "Quadratic Discriminant
Analysis","Logistic Regression"]
classifiers = [
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis(),LogisticRegression()]
A=list()
for clf in classifiers:
np.random.seed(1010)
er,cm=CCV(clf,Zid,Xn,yn)
A.append(er)
print(A)
from sklearn.ensemble import RandomForestClassifier
RF=RandomForestClassifier(n_estimators=500)
np.random.seed(1010)
er,cm=CCV(RF,Zid,Xn,yn)
print("Error rate=",er)
print (cm)
import numpy as np
import matplotlib
matplotlib.use('TkAgg')# not needed on Windows
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
names = ["Nearest Neighbors", "Linear SVM", "Decision Tree",
"Random Forest", "Naive Bayes"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.025),
DecisionTreeClassifier(max_depth=5),
RandomForestClassifier(n_estimators=500),
GaussianNB()]
A=list()
for clf in classifiers:
np.random.seed(1010)
er,cm=CCV(clf,Zid,Xn,yn)
A.append(er)
plt.bar(range(len(A)),A,)
plt.ylabel('Error rate')
plt.title('Error rate')
plt.xticks(np.arange(len(names)),names,rotation=90)
w=read.csv("full.aaup.csv")[,-(1:4)]#读入除前4个之外的13个变量
cor(w)
library(corrplot)
corrplot(cor(w), type = "lower", diag=FALSE, tl.col = "black", tl.srt = 45)
library(lattice)
rgb.palette <- colorRampPalette(c("white", "black"), space = "rgb")
levelplot(cor(w), las=1,main="Correlation matrix", xlab="", ylab="",
col.regions=rgb.palette(120), cuts=100, at=seq(0,1,0.01))
w=read.csv("full.aaup.csv")#读取数据
u=w[,-c(1,3,4)] #去掉无关定性变量(仅保存学校名称)
a=eigen(cor(u[,-1]))#解(不包括校名变量的)相关矩阵的特征值问题
a$va #输出特征值
data.frame(a$ve,row.names=names(u)[-1])[,1:4]#带有变量名的4个特征向量
(cca=(a$va)/sum(a$va))#贡献率
(ca=cumsum(a$va)/sum(a$va))#累积贡献率
#崖底碎石图及累积贡献率图
par(mfrow=c(1,2))
plot(1:(ncol(u)-1),a$va,type="o",pch=17,col=4,main="Scree Plot",
xlab="Component Number",ylab="Eigen Value")
plot(1:(ncol(u)-1),ca,type="o",pch=17,col=4,
main="Cumulative Contribution",
xlab="Component Number",ylab="Cumulative Contribution")
(b=eigen(diag(5)))
plot(1:5,b$values,type="o",pch=16,main ="meaningless scree plot")
(b=sweep(a$vec[,a$va>0],2,sqrt(a$value[a$va>0]),"*"))
pct=paste("(",round(a$va/sum(a$va)*100),"%)",sep="")
tt=NULL;for(i in 1:4)tt=c(tt,paste("Component", i,pct[i]))
plot(b[,1:2], type="n", xlab=tt[1], ylab=tt[2],main="Loadings",
xlim=c(-1,0.05))
text(b[,1],b[,2],names(u[,-1]));abline(h=0);abline(v=0)
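# Added check (not in the original code): with essentially all components kept,
# the loading matrix should reproduce the correlation matrix up to rounding error.
round(b%*%t(b)-cor(u[,-1]),10)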
par(mfrow=c(1,2))
sc=scale(u[,-1])%*%a$ve[,1:2] # compute the principal component scores
plot(sc[,1],sc[,2],type="n",ylim=c(-6,10),xlim=c(-11,7),
main="Sample Principal Components", xlab="Component 1",
ylab="Component 2")
text(sc[,1],sc[,2],u[,1],cex=.4)
abline(v=0,col=2);abline(h=0,col=2)
CA=(w$State=="CA")
plot(sc[CA,1],sc[CA,2],type="n",ylim=c(-6,4),xlim=c(-12,5),
main="Sample Principal Components for California Schools",
xlab="Component 1", ylab="Component 2")
text(sc[CA,1],sc[CA,2],u[CA,1],cex=1)
abline(v=0,col=2);abline(h=0,col=2)
par(mfrow=c(1,1))
u[CA,1][order(sc[CA,1],decreasing =F)][1:5]
u[CA,1][order(sc[CA,2],decreasing =T)][1:5]
library(jpeg)
xmy=readJPEG('XMY.jpg')
xmypca=list()
for(i in 1:3)xmypca[[i]]=prcomp(xmy[,,i], center = FALSE)
imge=list()
for (i in c(3,10,100)) {
imge[[i]] <- sapply(xmypca, function(y) {
comimge <- y$x[,1:i] %*% t(y$rotation[,1:i])
}, simplify = 'array')
writeJPEG(imge[[i]], paste('compXMY', round(i,0), '.jpg', sep = ''))
}
FS=file.info('XMY.jpg')$size
z=NULL
for(i in c(3,10,100))
z=c(z,file.info(paste("compXMY",i,".jpg",sep=""))$size/FS)
z
mk=readJPEG('market.jpg')
mksvd=list()
for(i in 1:3)mksvd[[i]]=svd(mk[,,i])
commk=mk;cmk=list()
FS=file.info('market.jpg')$size
for (i in c(5, 20, 100)){
for(k in 1:3){
commk[,,k]=mksvd[[k]]$u[,1:i] %*% diag(mksvd[[k]]$d[1:i]) %*%
t(mksvd[[k]]$v[,1:i])
writeJPEG(commk, paste('commk', round(i,0), '.jpg', sep = ''))
}
cmk[[i]]=commk
}
z=NULL;for(i in c(5,20, 100))
z=c(z,file.info(paste("commk",i,".jpg",sep=""))$size/FS);z
w=read.csv("LA.Neighborhoods.csv")
w$density=w$Population/w$Area # add population density
u=w[,-c(12:15)]# drop population, area, longitude and latitude
library("FactoMineR");library("factoextra")
row.names(u)=u[,1]
u.r=PCA(u[,-1], scale.unit=TRUE,graph=FALSE)# main PCA routine
fviz_eig(u.r, addlabels = TRUE, ylim = c(0, 50))
(u.eigen <- get_eigenvalue(u.r))# eigenvalues
fviz_contrib(u.r, choice="var", axes=1)
fviz_contrib(u.r, choice="var", axes=2)
fviz_pca_var(u.r)
fviz_pca_var(u.r, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE)
fviz_pca_ind(u.r,pointsize="cos2",pointshape=21,fill="#E7B800",repel=F)
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('TkAgg')# not needed on Windows
import matplotlib.pyplot as plt
%matplotlib inline
w=pd.read_csv("full.aaup.csv")
u=w[w.columns[4:]]
ava,ave=np.linalg.eig(np.corrcoef(u.T))
print("eigenvalues=\n",ava,"4 eigen vectors=\n",ave[:,:4])
cca=ava/np.sum(ava)# proportion of variance explained
ca=np.cumsum(ava)/np.sum(ava)# cumulative proportion
fig=plt.figure(figsize=(10,4))
fig.add_subplot(121)
plt.scatter(range(1,14),ava)
plt.plot(range(1,14),ava)
plt.title('Scree Plot')
plt.xlabel('Component Number')
plt.ylabel('Eigen Value')
fig.add_subplot(122)
plt.scatter(range(1,14),ca)
plt.plot(range(1,14),ca)
plt.title('Cumulative Contribution')
plt.xlabel('Component Number')
plt.ylabel('Cumulative Contribution')
#plt.show()
loadings=np.sqrt(ava)*ave
plt.scatter(loadings[:,0],loadings[:,1])
for i in range(13):
plt.text(loadings[i,0],loadings[i,1],w.columns[4:][i])
plt.title('Loadings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.grid(True)
plt.show()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
su=scaler.fit_transform(u)
sc=su.dot(ave)
CA=np.array(w[w.columns[2]]=='CA')
name_CA=np.array(w[w.columns[1]][CA])
sc1=sc[CA,:]
fig=plt.figure(figsize=(10,8))
fig.add_subplot(121)
plt.scatter(sc[:,0],sc[:,1])
for i in range(u.shape[0]):
plt.text(sc[i,0],sc[i,1],w[w.columns[1]][i])
plt.title('Sample Principal Components')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.grid(True)
fig.add_subplot(122)
plt.scatter(sc1[:,0],sc1[:,1])
for i in range(sc1.shape[0]):
plt.text(sc1[i,0],sc1[i,1],name_CA[i])
plt.title('Sample Principal Components for California Schools')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.grid(True)
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
w=pd.read_csv("full.aaup.csv")
u=w[w.columns[4:]];X=np.array(u)
pca = PCA(n_components=2)
Y=pca.fit_transform(X)# principal component scores
print(pca.explained_variance_ratio_) # percentage of variance explained
plt.figure(figsize=(10,7))
plt.scatter(pca.components_[0],pca.components_[1])
for i in range(13):
plt.text(pca.components_[0,i],pca.components_[1,i],w.columns[4:][i],
fontsize=20)
plt.title('Loadings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.grid(True)
plt.show()
import scipy
import scipy.ndimage
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from pylab import imread,subplot,imshow,title,gray,figure,show,NullLocator
def comp_2d(image_2d,numpc=100):
cov_mat = (image_2d.T - np.mean(image_2d , axis = 1)).T
eig_val, eig_vec = np.linalg.eigh(np.cov(cov_mat))
p = np.size(eig_vec, axis =1)
idx = np.argsort(eig_val)
idx = idx[::-1]
eig_vec = eig_vec[:,idx]
eig_val = eig_val[idx]
if numpc < p or numpc > 0:
eig_vec = eig_vec[:, range(numpc)]
score = np.dot(eig_vec.T, cov_mat)
recon = (np.dot(eig_vec, score).T + np.mean(image_2d, axis = 1)).T
recon_img_mat = np.uint8(np.absolute(recon))
return recon_img_mat
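# Added check (a small sketch, not part of the original listing): comp_2d returns
# an array with the same shape as the input channel, e.g. on random data.
print(comp_2d(np.random.rand(60, 40), numpc=5).shape)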
def pltPCA(a,numpc):
a_np = np.array(a)
a_r = a_np[:,:,0]
a_g = a_np[:,:,1]
a_b = a_np[:,:,2]
a_r_recon, a_g_recon, a_b_recon = comp_2d(a_r,numpc),\
comp_2d(a_g,numpc), comp_2d(a_b,numpc)
recon_color_img = np.dstack((a_r_recon, a_g_recon, a_b_recon))
recon_color_img = Image.fromarray(recon_color_img)
fig = plt.figure(figsize=(4, 3), dpi=72)
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0], frameon=False, aspect=1)
ax.set_xticks([])
ax.set_yticks([])
imshow(recon_color_img)
a = scipy.ndimage.imread("PF.jpg")
pltPCA(a,10);pltPCA(a,100)
w=read.csv("who1.csv")
library(corrplot)
corrplot(cor(w[,-c(1)]),type="lower",diag=FALSE,
tl.col="black",tl.srt=45)
w=read.csv("who1.csv")
a=factanal(scale(w[,-1]),factors=7,rotation ="none")
a$loadings
SS=apply(a$loadings^2,2,sum);PV=SS/ncol(w[,-1]);CV=cumsum(SS/ncol(w[,-1]))
rbind(SS,PV,CV)
a$uniquenesses
a5=factanal(scale(w[,-1]),factors=5,rotation ="varimax", scores="regression")
a5$loadings
a5$uniquenesses
a=factanal(scale(w[,-1]),factors=5,rotation ="none")
b=princomp(scale(w[,-1]))
B=matrix(c(1:4,4,5),3,2,b=T)
TT=c("FA with rotation","FA without rotation","PCA")
A=list(a5$loadings,a$loadings,b$loadings)
for(k in 1:3)
for (i in 1:3){
plot(A[[k]][,B[i,]],xlim=c(-1,1),ylim = c(-1,1),type="n")
text(A[[k]][,B[i,]],names(w[,-1]),cex=.6)
abline(h=0,lty=2);abline(v=0,lty=2);title(TT[k])
}
layout(t(1:2))
for (i in 1:2){
plot(a5$scores[,B[i,]],type="n",ylim=range(a5$scores[,B[i,2]]),
xlim=range(a5$scores[,B[i,1]]+c(-.3,.3)), main="Factor Scores")
abline(h=0);abline(v=0)
text(a5$scores[,B[i,]],labels=w[,1],cex=.5)
}
w=read.csv("peas2.csv")
a=factanal(scale(w),factors=4,rotation ="none" , scores="regression")
a$loading
a2=factanal(scale(w),factors=2,rotation ="varimax",
scores="regression")
a2$loadings
plot(a2$loadings[,1:2],type="n",xlim=c(-1.1,1.1))
text(a2$loadings[,1:2],labels=names(w),cex=.6)
abline(h=0,lty=2);abline(v=0,lty=2)
w=read.csv("full.aaup.csv");u=w[,-c(1,3,4)]
a=factanal(scale(u[,-1]),factors=6,rotation = "none")
a$loadings
a2=factanal(scale(u[,-1]),factors=2,rotation ="varimax",
scores="regression")
a2$loadings
layout(t(1:2))
plot(a2$loadings[,1:2],type="n",ylim=c(-1.1,1.1),
xlim=c(-1.1,1.2),main = "FA loadings")
abline(h=0,lty=2);abline(v=0,lty=2)
text(a2$loadings[,1:2], labels=names(u[,-1]),cex=.7)
plot(a2$scores[w[,3]=="CA",1:2],type="n",
ylim=c(-1.9,4),xlim=c(-2,5.5), main="Factor Scores")
abline(h=0,lty=2);abline(v=0,lty=2)
text(a2$scores[w[,3]=="CA",1:2],labels=u[w[,3]=="CA",1],cex=.7)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import FactorAnalysis
from sklearn import preprocessing
w=pd.read_csv("full.aaup.csv")
u=w[w.columns[4:]]
u_scale=preprocessing.scale(u)
fa=FactorAnalysis(tol=1e-8,max_iter=10000)
fa.n_components=2
fa.fit(np.array(u_scale))
fa.components_
print(pd.DataFrame(fa.components_,columns=u.columns))
CA=np.array(w[w.columns[2]]=='CA')
name_CA=np.array(w[w.columns[1]][CA])
sc1=fa.fit_transform(u_scale)[CA,:]
fig=plt.figure(figsize=(10,8))
fig.add_subplot(121)
plt.scatter(fa.components_[0],fa.components_[1])
for i in range(len(u.columns)):
plt.text(fa.components_[0,i],fa.components_[1,i],u.columns[i])
plt.grid(True)
plt.title('Loadings')
plt.xlabel('Factor 1')
plt.ylabel('Factor 2')
fig.add_subplot(122)
plt.scatter(sc1[:,0],sc1[:,1])
for i in range(sc1.shape[0]):
plt.text(sc1[i,0],sc1[i,1],name_CA[i])
plt.title('Sample Scores for California Schools')
plt.xlabel('Factor 1')
plt.ylabel('Factor 2')
plt.grid(True)
plt.show()
w=read.csv("classclust.csv")
library(rpart.plot)
(a=rpart(factor(V3)~.,w))
table(factor(w[,3]),predict(a,w,type="class"))
rpart.plot(a)
b=kmeans(w[,-3],2)
par(mar=c(4,4,3,2))
layout(t(1:2))
plot(w[,-3],pch=(w[,3]+1),col=w[,3]+1)
title("2 Classes")
plot(w[,-3],type="n");text(w[,-3],label=b$cluster)
title("2 Clusters")
library(factoextra)
d=get_dist(state.x77, method = "euclidean", stand = T)
fviz_dist(d,gradient = list(low = "#00AFBB", mid = "white",
high = "#FC4E07"))
library(CluMix)
distmap(flower, what="subjects")
distmap(flower, what="variables")
library(factoextra)
a=get_clust_tendency(scale(state.x77), nrow(state.x77)-1,
graph = TRUE)
a$hopkins_stat;plot(a$plot)
rs=apply(state.x77, 2,function(x){runif(length(x), min(x), (max(x)))})
ra=get_clust_tendency(scale(rs), nrow(rs)-1, graph = TRUE)
ra$hopkins_stat;plot(ra$plot)
m=c("ward.D", "ward.D2", "single", "complete",
"average", "mcquitty", "median","centroid")
h=list();par(mfrow=c(2,4))
for(i in 1:length(m)){
h[[i]]=hclust(dist(scale(state.x77)), method=m[i])
plot(h[[i]] ,labels=rownames(state.x77),cex=.4,
main=paste("Method: ",m[i]))
}
plot(h[[1]],cex=.8,main="Method: ward.D")
memb=identify(h[[1]], N=3)
for(i in 1:3)
{print(paste("Class ", i));print(names(memb[[i]]))}
w=read.csv("who1.csv")
hw=hclust(dist(scale(w[,-1])))
plot(hw ,labels=w[,1],cex=.5,main="Who data hierarchical clustering")
library(pvclust)
w=read.csv("who1.csv")
fit <- pvclust((scale(t(w[,-1]))), method.hclust="ward",
method.dist="euclidean")
plot(fit,labels=w[,1],cex=.4)
pvrect(fit, alpha=.95) # highlight clusters with AU p-value above 0.95
library(CluMix)
mix.heatmap(flower, dist.variables.method="distcor",
rowmar=7, legend.mat=TRUE)
layout(t(1:3))
plot(dendro.subjects(flower))
title("cluster for observations")
plot(dendro.variables(flower, method="associationMeasures"))
title("cluster for variables: association method")
plot(dendro.variables(flower, method="distcor"))
title("cluster for variables: distant correlation method")
w=read.csv("kmeansFig.csv")
a=kmeans(w,3) # fix the number of clusters at 3
plot(w,pch=a$cluster)
text(a$center,expression(c[1],c[2],c[3]),col=2)
w=read.csv("kmeansFig.csv")
wss=vector()
for (i in 1:10) wss[i] = kmeans(w,centers=i)$tot.withinss
plot(1:10, wss, type="b", xlab="K",
ylab=expression(WSS[K]),main="Within groups sum of squares")
w=read.csv("kmeansFig.csv")
library(factoextra)
fviz_nbclust(w, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2)+
labs(subtitle = "Elbow method")
w=read.csv("3g1.csv")
library("fpc")
set.seed(1010)
par(mfrow=c(2,3))
for(i in c(2,3,4)){
km <- kmeans(w, i)
plot(w,pch=km$cluster,col=km$cluster)
title(paste("k-means: k=", i))
}
for(i in c(1,5,7)){
db <- fpc::dbscan(w, eps = 0.2, MinPts =i)
plot(w,pch=db$cluster,col=db$cluster)
title(paste("DBSCAN: Minpts=", i))
}
w=read.csv("kmeansFig.csv")
library(KernSmooth)
est=bkde2D(w, bandwidth=rep(1,2)*.6)
contour(est$x1, est$x2, est$fhat, col = "blue",xlab="x",ylab="y")
points(w, col = "red", pch = 16)
library(mclust)
w=read.csv("kmeansFig.csv")
res = Mclust(w)
summary(res)
plot(res, what = c("BIC", "classification"))
w=read.csv("who1.csv")
res = Mclust(scale(w[,-1]))
summary(res)
plot(res, what = c("BIC", "classification"))
w[res$classification==1,1]
w[res$classification==2,1]
library(cluster)
(z=clusGap(w, FUN=kmeans, 5))
plot(z, main = "Gap statistic")
library(factoextra)
fviz_nbclust(w, kmeans, nstart = 10, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")
library(cluster)
pr3 <- pam(w, 3)
si <- silhouette(pr3)
plot(si,col=c("red","green","blue"))
library(factoextra)
fviz_nbclust(w, kmeans, method = "silhouette")+
labs(subtitle = "Silhouette method")
library(factoextra)
w=read.csv("who1.csv")
fviz_nbclust(scale(w[,-1]), hcut, method = "silhouette")+
labs(subtitle = "Silhouette method")
w=read.csv("who1.csv")
w1=data.frame(w[,-1],row.names = w[,1])
res=hcut(scale(w1), k = 3, stand = TRUE)
fviz_dend(res, rect = TRUE,cex=.3,horiz = T)
fviz_silhouette(res)
fviz_cluster(res)
for(i in 1:3) print(w[res$cluster==i,-1])
w=read.csv("who1.csv")
w1=data.frame(w[,-1],row.names = w[,1])
library(NbClust)
a<-NbClust(scale(w1), distance = "euclidean", min.nc=2,
max.nc=8, method = "complete", index = "all")
w=read.csv("who1.csv")
w1=data.frame(w[,-1],row.names = w[,1])
library(fpc)
pamk.best <- pamk(scale(w1))
cat("estimated number of clusters:", pamk.best$nc, "\n")
plot(pam(w1, pamk.best$nc))
w=read.csv("who1.csv")
library(ICGE)
d=dist(scale(w[,-1]))
T=vector()
for (i in 2:10)
{part=pam(d, i)$clustering
T[i]=INCAindex(d, part)$Total}
plot(T, type="b", xlab="Number of clusters",
ylab="INCA", xlim=c(1.5, 10.5))
title("Percentage of objects well classified in the partition")
library("factoextra")
w=read.csv("who1.csv")
w1=data.frame(w[,-1],row.names = w[,1])
res=hcut(scale(w1), k = 8, stand = TRUE)
fviz_dend(res, rect = TRUE,cex=.3,horiz = T)
fviz_silhouette(res)
fviz_cluster(res)
for(i in 1:8) print(as.character(w[res$cluster==i,1]))
library(factoextra)
w=read.csv("seeds.csv")
a=get_clust_tendency(scale(w[,-8]), nrow(w)-1, graph = TRUE)
a$hopkins_stat
library(CluMix)
hm=mix.heatmap(scale(w[,-8]),rowmar = 14)
library(NbClust)
a<-NbClust(scale(w[,-8]), distance = "euclidean", min.nc=2,
max.nc=8, method = "complete", index = "all")
w=read.csv("seeds.csv")
hh=hclust(dist(scale(w[,-8])), "complete")
plot(hh,labels=w[,8],cex=.6)
id=identify(hh)# interactively cut into 3 groups
res=hcut(scale(w[,-8]), k = 3, stand = TRUE)
fviz_cluster(res)
wt=t(w[,-8])
ht=hclust(dist(scale(wt)), "complete")
plot(as.dendrogram(ht),horiz=TRUE)
set.seed(1010)
a=kmeans(scale(w[,-8]),3)
fviz_cluster(a, w[, -8], ellipse.type = "norm")
res.hk <-hkmeans(scale(w[,-8]), 3)
fviz_cluster(res.hk)
w=read.csv("CTG.NAOMIT.csv")
w[,21:23]=apply(w[,21:23],2,factor)
w[,1:20]=scale(w[,1:20])
library(CluMix)
mix.heatmap(w[,-23], rowmar=7, dist.variables.method ="distcor")
plot(dendro.subjects(w[,-23]))
plot(dendro.variables(w[,-23], method="distcor"))
library(cluster)
cluster <- daisy(w[,-23], metric = c("gower"),
stand = FALSE, type = list())
try <- agnes(cluster,3)
rhc=cutree(try,k=3)
table(w$NSP,rhc)[,c(2,1,3)]
w=read.csv("full.aaup.csv")
w1=w[,-(1:4)]
library(CluMix)
mix.heatmap(scale(w1), rowmar=7)
library(NbClust)
res<-NbClust(scale(w1), distance = "euclidean", min.nc=2,
max.nc=8, method = "complete", index = "all")
library(factoextra)
ds=hcut(w1, k = 3, stand = TRUE)
fviz_cluster(ds)
dv=hcut(t(w1), k = 2, stand = TRUE)
fviz_cluster(dv)
for(i in 1:2) print(names(w1)[dv$cluster==i])
fviz_dend(dv, k = 2, k_colors = c("#1B9E77", "#D95F02"))
library(jpeg)
VG=readJPEG("VG.jpg")
imgDm=dim(VG)
VG_rgb <- data.frame(
x = rep(1:imgDm[2], each = imgDm[1]),
y = rep(imgDm[1]:1, imgDm[2]),
R = as.vector(VG[,,1]),
G = as.vector(VG[,,2]),
B = as.vector(VG[,,3]))
par(mfrow=c(2,2))
for (k in c(2,4,16,32)){
VG_km=kmeans(VG_rgb[, 3:5], centers = k)
kc=rgb(VG_km$centers[VG_km$cluster,])
plot(y~x, data=VG_rgb, col=kc,asp=1,pch = ".",
axes=F,xlab="",ylab="",
main=paste("k-Means Clustering of", k, "Colours"))
}
library(scales)
par(mfrow=c(2,2))
for (k in c(2,4,16,32)){
VG_km = kmeans(VG_rgb[, 3:5], centers = k)
show_col(rgb(VG_km$centers))
}
img=readJPEG("Greek.jpg")
imgDm <- dim(img);imgDm
img_rgb <- data.frame(
x = rep(1:imgDm[2], each = imgDm[1]),
y = rep(imgDm[1]:1, imgDm[2]),
R = as.vector(img[,,1]),
G = as.vector(img[,,2]),
B = as.vector(img[,,3]))
img_km = kmeans(img_rgb[, 3:5], centers = 2)
kc = rgb(img_km$centers[img_km$cluster,])
ukc=unique(kc)
kc1=kc;kc1[kc1==ukc[1]]="#000000";kc1[kc1==ukc[2]]="#ffffff"
kc2=kc;kc2[kc2==ukc[1]]="#ffffff";kc2[kc2==ukc[2]]="#000000"
plot(y~x,img_rgb,col=kc,asp=1,pch=".",axes=F,xlab="",ylab="")
plot(y~x,img_rgb,col=kc1,asp=1,pch=".",axes=F,xlab="",ylab="")
plot(y~x,img_rgb,col=kc2,asp=1,pch=".",axes=F,xlab="",ylab="")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from __future__ import division # so that integer division yields floats
from sklearn.cluster import AgglomerativeClustering, KMeans
import scipy.cluster.hierarchy
import scipy.cluster.vq
from scipy.cluster.hierarchy import (dendrogram, linkage,
set_link_color_palette)
import seaborn as sns
%matplotlib inline
w=pd.read_csv("statex77.csv")
u=w[w.columns[1:]]
u.index=w[w.columns[0]] # use the first column of w as the index
del u.index.name # remove the index name
u.head()
sns.clustermap(u, metric="correlation", method="complete", cmap="Blues",
standard_scale=1)
Z= linkage(u, method='average', metric='euclidean')
set_link_color_palette(['m', 'c', 'y', 'k'])
fig = plt.figure(figsize=(10, 5), dpi=72)
dendrogram(Z, labels=u.index, leaf_rotation=0, orientation="left",
leaf_font_size=6,color_threshold=None,above_threshold_color='blue')
set_link_color_palette(None)
#plt.show()
np.random.seed(8888)
cent,label=scipy.cluster.vq.kmeans2(u, 4, iter=20, thresh=1e-05,
minit='random', missing='warn', check_finite=True)
print("centroid:\n",cent)
for i in range(4):
print(u.index[label==i])
w=pd.read_csv("seeds.csv")
wnm=w.columns[:7] # exclude the ID column
u=w[wnm]
u.describe()
cor = u.corr()
sns.heatmap(cor, square = True)
my_palette = dict(zip(w.ID.unique(), ["orange","yellow","brown"]))
row_colors = w.ID.map(my_palette)
sns.clustermap(u, metric="correlation", method="complete", cmap="Blues",
standard_scale=1, row_colors=row_colors)
Z = linkage(u, 'ward')
fig = plt.figure(figsize=(20, 10), dpi=72)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance (Ward)')
dendrogram(Z, labels=u.index, leaf_rotation=90)
my_palette = plt.cm.get_cmap("Accent", 3)
w['ID']=pd.Categorical(w['ID'])
my_color=w['ID'].cat.codes
ax = plt.gca()
xlbls = ax.get_ymajorticklabels()
num=-1
for lbl in xlbls:
num+=1
val=my_color[num]
lbl.set_color(my_palette(val))
hc=AgglomerativeClustering(n_clusters=3, linkage='complete')
hc.fit(u)
hcl=hc.labels_.astype(np.int)
print ('1st original group','\n', hcl[:71])
print ('2nd original group','\n', hcl[71:141])
print ('3rd original group','\n', hcl[141:211])
kc=KMeans(n_clusters=3,random_state=1010)
kc.fit(u)
kcl=kc.predict(u)
print ('1st original group','\n', kcl[:71])
print ('2nd original group','\n', kcl[71:141])
print ('3rd original group','\n', kcl[141:211])
from sklearn.cluster import MiniBatchKMeans
from PIL import Image
dance = Image.open("Dance.jpg")
d=np.array(dance)
d = d / 255.0
d = d.reshape(np.prod(d.shape[:2]), 3)
fig=plt.figure(figsize=(26,22))
fig.subplots_adjust(wspace=0)
pp=np.array([2,8,16,32])
for i in range(4):
K=220+i+1
plt.subplot(K)
k = MiniBatchKMeans(pp[i]);k.fit(d)
new = k.cluster_centers_[k.predict(d)]
dd = new.reshape(np.array(dance).shape)
plt.imshow(dd)
w=read.csv("oliveoil.csv")
library(GGally)
ggpairs(w)
w=scale(read.csv("oliveoil.csv"))
S=cor(w)
Sxx=S[1:5,1:5];Syy=S[6:11,6:11]
Sxy=S[1:5,6:11];Syx=t(Sxy)
"%^%" <- function(x, n)
with(eigen(x), vectors %*% (values^n * t(vectors)))
Sxx12=Sxx%^%(-1/2)
Syy12=Syy%^%(-1/2)
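# Added check (not in the original code): Sxx12 is the inverse square root of Sxx,
# so Sxx12 %*% Sxx12 %*% Sxx should be numerically the identity matrix.
round(Sxx12%*%Sxx12%*%Sxx,10)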
K=Sxx12%*%Sxy%*%Syy12
svd.k=svd(K)
c=svd.k$u;d=svd.k$v;lambda=svd.k$d
a=Sxx12%*%c;b=Syy12%*%d
print(list(xcoef=a,ycoef=b, cor=lambda))
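# Added cross-check (not in the original code): the canonical correlations should
# agree with base R's stats::cancor applied to the two standardized blocks.
round(stats::cancor(w[,1:5],w[,6:11])$cor-lambda,10)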
X1=read.csv("oliveoil.csv")[,1:5]
sqrt(1/diag(cov(X1)))*a
X=w[,1:5];Y=w[,6:11] # the two standardized variable blocks (taken from the scaled data above)
X.score=X%*%a; Y.score=Y%*%b
corr.X.x=cor(X,X.score); corr.Y.x=cor(Y,X.score)
corr.X.y=cor(X,Y.score); corr.Y.y=cor(Y,Y.score)
print(list(corr.X.x=corr.X.x,corr.Y.x=corr.Y.x,
corr.X.y=corr.X.y,corr.Y.y=corr.Y.y))
plot(lambda,type = "o",main="Scree Plot")
library(car) # for dataEllipse()
par(mfrow=c(2,2))
for(i in 1:4){
plot(X.score[,i],Y.score[,i],pch=17,cex=1.5,col=4,
ylim=range(Y.score),xlim=range(X.score),
xlab=paste("X.score #", i),ylab=paste("Y.score #", i))
dataEllipse(X.score[,i],Y.score[,i],add = T,lwd=1,grid=F)
}
stats=c("Wilks","Hotelling","Pillai","Roy")
ctest=function(lambda=lambda,stat=stat){
res=p.asym(lambda, 16, 5, 6, tstat = stat)
print(res)
if (stat!="Roy")
for(i in 1:2)
plt.asym(res,rhostart=i)
}
par(mfrow=c(3,2))
for(i in 1:4)ctest(lambda=lambda,stats[i])
w=read.csv("full.aaup.csv")
X=w[,5:12];Y=w[,13:17]
library(GGally)
ggpairs(w[,-c(1:4)])
pairs(w[,-(1:4)])# output not shown
cor(w[,-(1:4)])
library(candisc)
ccxy <- candisc::cancor(X, Y, set.names=c("X", "Y"))
ccxy
ccxy$coef
ccxy$structure
par(mfrow=c(1,3))
plot(ccxy, smooth=TRUE, id.n=3, ellipse.args=list(fill=TRUE))
plot(ccxy, which=2, smooth=TRUE)
plot(ccxy, which=3, smooth=TRUE)
par(mfrow=c(1,1))
SC=ccxy$scores # canonical scores (assumed component of the candisc cancor object)
plot(SC$X[,1],SC$Y[,1])
plot(SC$X[,2],SC$Y[,2])
par(mfrow=c(2,2))
for(i in 1:4){
t=ccxy$structure[[i]];rg=range(t)+c(-.1,.1)
plot(t[,1:2],type="n",xlim=rg,ylim=rg)
text(t[,1:2],rownames(t))
arrows(0,0,t[,1],t[,2],col=4,angle = 5)
abline(h=0,lty=2);abline(v=0,lty=2)
}
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_decomposition import CCA
w=pd.read_csv("full.aaup.csv")
u=w[w.columns[4:]]
X=np.array(u[u.columns[:8]])
Y=np.array(u[u.columns[8:]])
cc=CCA(3)
cc.fit(X,Y)
print (cc.x_weights_)
print (cc.y_weights_)
print (cc.x_loadings_)
print (cc.y_loadings_)
print (cc.x_scores_)
print (cc.y_scores_)
fig=plt.figure(figsize=(30,10))
for i in range(3):
fig.add_subplot(1,3,i+1)
plt.scatter(cc.x_scores_[:,i],cc.y_scores_[:,i])
plt.show()
w=read.csv("IncomeCA.csv")#读入数据
(ie=xtabs(~education+income,w))#生成二维表
library(ca)
(res=ca(ie))
library(factoextra)
fviz_ca_biplot(res,map ="rowprincipal",arrow = c(TRUE, TRUE))
library("gplots")
balloonplot(t(ie), main ="Education-Income", xlab ="", ylab="",
label = FALSE, show.margins = FALSE)
(res=ca(ie))
library("factoextra")
eig=get_eigenvalue(res);tr=sum(eig$eig);(ccor=sqrt(tr))
ch2=tr*sum(as.matrix(ie))
df=(nrow(ie) - 1)*(ncol(ie) - 1)
(pval=pchisq(ch2, df = df, lower.tail = FALSE))
fviz_screeplot(res) +
geom_hline(yintercept=25, linetype=2, color="red")
row=get_ca_row(res);col=get_ca_col(res)
library("corrplot")
corrplot(row$contrib, is.corr=FALSE)
corrplot(col$contrib, is.corr=FALSE)
w=read.csv("IncomeCA.csv")#读入数据
z=xtabs(~education+income+language.in.home,w)
ftable(z, row.vars =c(1,3), col.vars = "income")
library(ca)
(MCA=mjca(w,lambda="Burt"))
(JCA=mjca(w,lambda="JCA"))
par(mfrow=c(1,2))
plot(MCA)
plot(JCA)
(MCA=mjca(w,lambda="Burt"))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
v= pd.read_csv('incomeCA.csv',index_col=False)
id=v.columns[np.array([0,2])]
w=v[id]
Z=w
for i in range(2):
Z=pd.concat([Z,pd.get_dummies(w[id[i]])],axis=1)
z=np.array(Z[Z.columns[2:]])
P=z[:,:6].T.dot(z[:,6:])/z.shape[0]
r=np.sum(P,axis=1);c=np.sum(P,axis=0)
Dr=np.diag(np.sqrt(r))
Dc=np.diag(np.sqrt(c))
r.shape=len(r),1
c.shape=len(c),1
S=np.dot(Dr.dot(P-r.dot(c.T)),Dc)
U,s,V=np.linalg.svd(S)
X=Dr.dot(U)
Y=Dc.dot(V.T)
T1=id[0]+":"+Z.columns[2:8]
T2=id[1]+":"+Z.columns[8:14]
plt.xlim(-0.5,0.5)
plt.scatter(X[:,0],X[:,1],color='r')
for i in range(6):
plt.text(X[i,0],X[i,1],T1[i],color='r')
plt.arrow(0,0, X[i,0],X[i,1],color='r',head_width=0.01,head_length=0.01)
plt.scatter(Y[:,0],Y[:,1],color='b')
for i in range(5):
plt.text(Y[i,0],Y[i,1],T2[i],color='b')
plt.arrow(0,0, Y[i,0],Y[i,1],color='b',head_width=0.01,head_length=0.01)
plt.grid()
plt.show()
v= pd.read_csv('income0.csv',index_col=False)
id=v.columns[np.array([0,4,13])]
w=v[id]
Z=w
for i in range(3):
Z=pd.concat([Z,pd.get_dummies(w[id[i]])],axis=1)
z=np.array(Z[Z.columns[3:]])
Z1=z[:,:6];Z2=z[:,6:11];Z3=z[:,11:]
Z01=np.concatenate((Z1.T.dot(Z1),Z1.T.dot(Z2),Z1.T.dot(Z3)),axis=1)
Z02=np.concatenate((Z2.T.dot(Z1),Z2.T.dot(Z2),Z2.T.dot(Z3)),axis=1)
Z03=np.concatenate((Z3.T.dot(Z1),Z3.T.dot(Z2),Z3.T.dot(Z3)),axis=1)
C=np.concatenate((Z01,Z02,Z03),axis=0)
Q=3;q=np.array([6,5,3])
n=sum(q);d=np.sum(C/Q/n,axis=0);D=np.diag(np.sqrt(d))
d=d.reshape(len(d),1)
U,s,V=np.linalg.svd((D.dot(C/Q/n-d.dot(d.T))).dot(D))
print("U:\n",U,"\n","s:\n",s,"\n","V:\n",V)
loc=cmdscale(UScitiesD)
x=-loc[, 1];y=-loc[, 2]# the minus signs keep the usual map orientation (north up, west left)
plot(x, y,type="n",xlab="",ylab="",asp=1,main="US cities")
text(x, y, rownames(loc), cex = 1)
mmds=function(d,k=2){
d=as.matrix(d)^2;n=nrow(d) # classical MDS works on squared distances
I=diag(n);one=matrix(1,n,1)
C=I-(1/n)*one%*%t(one)     # centering matrix
B=-0.5*C%*%d%*%C
a=eigen(B); lambda=diag(a$values)
"%^%" <- function(x, n)
with(eigen(x), vectors %*% (values^n * t(vectors)))
Xp=t(lambda[1:k,1:k]%^%(1/2)%*%t(a$vectors[,1:k]))
row.names(Xp)=row.names(d)
return(Xp)}
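# Added usage sketch (not in the original code): mmds() should essentially
# reproduce cmdscale() up to reflection of the axes, e.g. on UScitiesD
# ("us" is just a throwaway name used here).
us=mmds(UScitiesD)
plot(us,type="n",asp=1,main="mmds() on UScitiesD")
text(us,rownames(us),cex=1)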
library(MASS)
data("States",package = "carData")
df=States[,-1]
par(mar=c(1,1,3,1))
layout(t(1:3))
df.scal <- cmdscale(dist(df), k = 2, eig = T)
plot(df.scal$points, type = "n",
xlim=range(df.scal$points[,1])+c(-2000,1000),
ylim=range(df.scal$points[,2]),
xlab = "",main = "mMDS",tick=0,yaxt="n")
text(df.scal$points, labels = row.names(States),
col = 1, cex = 0.8)
df.sam <- sammon(dist(df))
plot(df.sam$points, type = "n",
xlim=range(df.sam$points[,1])+c(-2000,1000),
ylim=range(df.sam$points[,2]),
xlab = "",main = "Sammon's mapping",tick=0,yaxt="n")
text(df.sam$points, labels = row.names(States),
col = 1, cex = 0.8)
df.iso <- isoMDS(dist(df))
plot(df.iso$points, type = "n",
xlim=range(df.iso$points[,1])+c(-2000,1000),
ylim=range(df.iso$points[,2]),
xlab = "",main = "Kruscal MDS",tick=0,yaxt="n")
text(df.iso$points, labels = row.names(States),
col = 1, cex = 0.8)
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA
w=pd.read_csv("States.csv",index_col=0)#州名字在数据文件第一列
w=w.iloc[:,1:]#去掉地区名字列
X=np.array(w)
d= euclidean_distances(w)
seed=1010
mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
random_state=seed,dissimilarity="precomputed", n_jobs=1)
pos = mds.fit(d).embedding_
nmds = manifold.MDS(n_components=2,metric=False, max_iter=3000,
eps=1e-12, dissimilarity="precomputed",random_state=seed,
n_jobs=1,n_init=1)
npos = nmds.fit_transform(d, init=pos)
# rescale:
pos *= np.sqrt((X ** 2).sum()) / np.sqrt((pos ** 2).sum())
npos *= np.sqrt((X ** 2).sum()) / np.sqrt((npos ** 2).sum())
# transform using principal component analysis:
clf = PCA(n_components=2)
X = clf.fit_transform(X)
pos = clf.fit_transform(pos)
npos = clf.fit_transform(npos)
fig = plt.figure(figsize=(12,4))
ax = fig.add_subplot(1,3, 1)
s = 10 # marker size
plt.scatter(X[:, 0], X[:, 1], color='navy',marker='o',s=s,lw=0)
plt.title('True position')
for i in np.arange(len(npos[:,0])):
plt.text(X[i, 0],X[i, 1],w.index[i])
ax = fig.add_subplot(1,3, 2)
plt.scatter(pos[:,0],pos[:,1],color='turquoise',marker='^',s=s,lw=0)
for i in np.arange(len(pos[:,0])):
plt.text(pos[i, 0],pos[i, 1],w.index[i])
plt.title('mMDS')
ax = fig.add_subplot(1,3, 3)
plt.scatter(npos[:,0],npos[:,1],color='darkorange',marker='s',s=s,lw=0)
for i in np.arange(len(npos[:,0])):
plt.text(npos[i, 0],npos[i, 1],w.index[i])
plt.title('nMDS')
plt.show()