Machine Learning Assignment 2: the Iris Dataset
1. Loading the Iris Dataset
The dataset behaves like a dictionary with the following fields:
{
"data": ...,
"target": ...,
"target_names": ...,
"DESCR": ...
}
The data field holds the features as a NumPy array.
target holds the numeric class label of each flower.
target_names maps those labels to the flower names.
DESCR is a textual description of the dataset.
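The helper readData() used throughout this report is not shown; a minimal sketch of what it presumably does, assuming the dataset is loaded through scikit-learn's load_iris (the function name readData and the exact dtypes are assumptions), is:

import numpy as np
from sklearn.datasets import load_iris

def readData():
    # hypothetical loader: returns the 150x4 feature matrix and the label vector (0, 1, 2)
    iris = load_iris()                                   # Bunch with data, target, target_names, DESCR
    data = np.asarray(iris["data"], dtype="float64")     # features
    tar = np.asarray(iris["target"], dtype="int64")      # class labels
    return data, tar

data, tar = readData()
print(data.shape, tar.shape)   # (150, 4) (150,)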
2. Dataset Visualization
2.1 Visualization Results
As the figures above show, versicolor and virginica are difficult to tell apart, while setosa separates from both of them much more clearly.
2.2 Code:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# plt.rcParams['font.sans-serif']=['SimHei']  # uncomment to render Chinese labels correctly
plt.rcParams['savefig.dpi'] = 300  # resolution of saved figures
plt.rcParams['figure.dpi'] = 300   # on-screen figure resolution
def data_visualization_3D(df_Iris, tar):
    fig = plt.figure(figsize=(10, 10))
    # the four possible 3-feature combinations, and the matching axis labels
    xx = [[0, 1, 2], [1, 2, 3], [0, 1, 3], [0, 2, 3]]
    yy = [["sepal_length (cm)", "sepal_width (cm)", "petal_length (cm)"],
          ["sepal_width (cm)", "petal_length (cm)", "petal_width (cm)"],
          ["sepal_length (cm)", "sepal_width (cm)", "petal_width (cm)"],
          ["sepal_length (cm)", "petal_length (cm)", "petal_width (cm)"]]
for i in range(4):
ax=fig.add_subplot(221+i,projection="3d")
ax.scatter(df_Iris[tar==0,xx[i][0]],df_Iris[tar==0,xx[i][1]],df_Iris[tar==0,xx[i][2]],c="r",marker="o",label="setosa")
ax.scatter(df_Iris[tar == 1, xx[i][0]], df_Iris[tar == 1, xx[i][1]], df_Iris[tar == 1, xx[i][2]], c="b",
marker="x", label="versicolor")
ax.scatter(df_Iris[tar == 2, xx[i][0]], df_Iris[tar == 2, xx[i][1]], df_Iris[tar == 2, xx[i][2]], c="g",
marker="^", label="virginica")
ax.set_xlabel(yy[i][0])
ax.set_ylabel(yy[i][1])
ax.set_zlabel(yy[i][2])
plt.legend(loc=0)
plt.show()
data,tar=readData()
data_visualization_3D(data,tar)
3. MED Linear Classification
3.1 Classification Results
3.2 Quantitative Metrics
Accuracy: 1.0
Recall: 1.0
specificity: 1.0
F1_Score 1.0
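For reference, the MED (minimum Euclidean distance) classifier used below assigns a test sample to whichever class mean it is closer to, which reduces to the sign of a linear discriminant; the metrics above follow from the resulting confusion counts. A sketch of the formulas implemented in the code, with z_1 and z_2 the two class means:

$$
g(x) = (z_2 - z_1)^{\top}\left(x - \frac{z_1 + z_2}{2}\right),
\qquad
\hat{y} =
\begin{cases}
\text{positive class (C2)} & g(x) \ge 0,\\
\text{negative class (C1)} & g(x) < 0,
\end{cases}
$$

$$
\text{Accuracy} = \frac{TP + TN}{TP + TN + FP + FN},\quad
\text{Recall} = \frac{TP}{TP + FN},\quad
\text{Specificity} = \frac{TN}{TN + FP},\quad
F_1 = \frac{2\,\text{Precision}\cdot\text{Recall}}{\text{Precision} + \text{Recall}}.
$$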
3.3 Core Code
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
def get_Iris_linear(data,tar,flag):
linear_data=[data[i] for i in range(data.shape[0]) if tar[i]!=flag]
linear_tar=[tar[i] for i in range(data.shape[0]) if tar[i]!=flag]
return np.asarray(linear_data,dtype="float64"),np.asarray(linear_tar,dtype="float64")
def hold_out_partition(testRate, trainRate, data, tar):  # hold-out split: testRate of each class goes to the test set, the rest to training
import random
import numpy as np
testSet = []
testTar = []
trainSet = []
trainTar = []
listNum = []
for i in range(data.shape[0]):
if tar[i] == 1:
listNum.append(i)
for i in random.sample(listNum, int(testRate * 50)):
testSet.append(data[i])
testTar.append(tar[i])
listNum.remove(i)
for i in listNum:
trainSet.append(data[i])
trainTar.append(tar[i])
listNum = []
for i in range(data.shape[0]):
if tar[i] != 1:
listNum.append(i)
for i in random.sample(listNum, int(testRate * 50)):
testSet.append(data[i])
testTar.append(tar[i])
listNum.remove(i)
for i in listNum:
trainSet.append(data[i])
trainTar.append(tar[i])
return np.asarray(testSet, dtype="float64"), np.asarray(testTar, dtype="float64"), np.asarray(trainSet,dtype="float64"), np.asarray(trainTar, dtype="float64")
cmap={
0:"r",
1:"b",
2:"g"
}
shapeMap={
0:"o",
1:"x",
2:"^"
}
map={
"Iris-setosa":0,
"Iris-versicolor":1,
"Iris-virginica":2,
0:"setosa",
1:"versicolor",
2:"virginica"
}
def classifier_MED(data, tar, posC, negC):  # MED (minimum Euclidean distance) classifier
testSet ,testTar ,trainSet ,trainTar = partition.hold_out_partition(0.3,0.7,data,tar)
C1,C2=[],[]
N1,N2=0,0
for i in range(trainSet.shape[0]):
if trainTar[i]==negC:
N1+=1
C1.append(trainSet[i])
elif trainTar[i]==posC:
N2+=1
C2.append(trainSet[i])
C1,C2=np.asarray(C1),np.asarray(C2)
z1,z2=C1.sum(axis=0)/N1,C2.sum(axis=0)/N2
testRes=[]
for x in testSet:
res=np.dot((z2-z1).transpose(),(x-(z1+z2)/2))
testRes.append(res)
testTar=testTar.astype("int16")
TP,FP,TN,FN=0,0,0,0
for i in range (len(testRes)):
        # class C2 (posC) is the positive class, class C1 (negC) the negative class
if testTar[i]==posC and testRes[i]>=0:
TP+=1
elif testTar[i]==posC and testRes[i]<0:
FN+=1
elif testTar[i]==negC and testRes[i]<0:
TN+=1
elif testTar[i] == negC and testRes[i]>=0:
FP+=1
testRes=np.array(testRes)
accuracy=float((TP+TN)/(TP+TN+FP+FN))
recall=float(TP/(TP+FN))
precision=float(TP/(TP+FP))
specificity=float(TN/(TN+FP))
F1_Score=float((2*recall*precision)/(recall+precision))
print("Accuracy:",accuracy)
print("Recall:",recall)
print("specificity:",specificity)
print("F1_Score",F1_Score)
    # plotting
fig = plt.figure(figsize=(10, 10))
xx = [[0, 1, 2], [1, 2, 3], [0, 1, 3], [0, 2, 3]]
    yy = [["sepal_length (cm)", "sepal_width (cm)", "petal_length (cm)"],
          ["sepal_width (cm)", "petal_length (cm)", "petal_width (cm)"],
          ["sepal_length (cm)", "sepal_width (cm)", "petal_width (cm)"],
          ["sepal_length (cm)", "petal_length (cm)", "petal_width (cm)"]]
for i in range(4):
ax = fig.add_subplot(221 + i, projection="3d")
X, Y = np.meshgrid(np.arange(testSet.min(axis=0)[xx[i][0]],testSet.max(axis=0)[xx[i][0]],1), np.arange(testSet.min(axis=0)[xx[i][1]],testSet.max(axis=0)[xx[i][1]],1))
u1=np.array([z1[xx[i][0]],z1[xx[i][1]],z1[xx[i][2]]])
u2=np.array([z2[xx[i][0]],z2[xx[i][1]],z2[xx[i][2]]])
u=(u2-u1).transpose()
Z=(np.dot(u,(u1+u2)/2)-u[0]*X-u[1]*Y)/u[2]
ax.scatter(testSet[testRes>=0,xx[i][0]], testSet[testRes>=0,xx[i][1]],testSet[testRes>=0,xx[i][2]], c=cmap[posC], marker=shapeMap[posC], label=map[posC])
ax.scatter(testSet[testRes<0,xx[i][0]],testSet[testRes<0,xx[i][1]], testSet[testRes<0,xx[i][2]],c=cmap[negC], marker=shapeMap[negC],label=map[negC])
ax.set_xlabel(yy[i][0])
ax.set_ylabel(yy[i][1])
ax.set_zlabel(yy[i][2])
        ax.plot_surface(X, Y, Z, alpha=0.4)  # MED decision boundary restricted to the three plotted features
ax.legend(loc=0)
plt.show()
linear_data, linear_tar = get_Iris_linear(data, tar, 2)  # drop class 2 (virginica): the remaining pair is linearly separable
classifier_MED(linear_data, linear_tar, 0, 1)            # setosa (0) vs. versicolor (1)
4. Whitening the Dataset
4.1 Figures
After whitening, the data become easier to separate along some of the dimensions.
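A sketch of the transform applied in the code below, with Sigma the covariance matrix of the original features, Phi its matrix of unit eigenvectors and Lambda the diagonal matrix of eigenvalues:

$$
W = \Lambda^{-1/2}\Phi^{\top},\qquad x' = W x,\qquad
\operatorname{Cov}(x') = W\,\Sigma\,W^{\top} = \Lambda^{-1/2}\Phi^{\top}\left(\Phi\Lambda\Phi^{\top}\right)\Phi\,\Lambda^{-1/2} = I .
$$

After this transform the features are decorrelated and have unit variance, which is why some projections separate the classes more cleanly.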
4.2 Core Code
def whitening(data):
    Ex = np.cov(data, rowvar=False)           # Ex: covariance matrix of the features
    print(Ex.shape)
    a, b = np.linalg.eig(Ex)                  # eigenvalues and eigenvectors of Ex
    a, b = np.real(a), np.real(b)
    b = b / np.linalg.norm(b, axis=0)         # normalize each eigenvector (column of b) to unit length
    A = np.diag(a ** (-0.5))                  # diagonal matrix Lambda^(-1/2)
    W = np.dot(A, b.transpose())              # whitening transform W = Lambda^(-1/2) * Phi^T
    W = np.nan_to_num(W)                      # guard against numerically zero eigenvalues
    X = np.dot(W, np.dot(Ex, W.transpose()))  # sanity check: W * Ex * W^T should be close to the identity
    print(W)
    return np.dot(data, W.transpose())        # whitened samples: x' = W x
data_whitened = whitening(data)
visualization.data_visualization_3D(data_whitened, tar)
5. MED Nonlinear Classification
5.1 Classification Results
5.2 Quantitative Metrics
Accuracy: 0.9
Recall: 0.8666666666666667
specificity: 0.9333333333333333
F1_Score 0.896551724137931
5.3 Core Code
def get_Iris_noLinear(data,tar,flag):
linear_data = [data[i] for i in range(data.shape[0]) if tar[i] != flag]
linear_tar = [tar[i] for i in range(data.shape[0]) if tar[i] != flag]
return np.asarray(linear_data, dtype="float64"), np.asarray(linear_tar, dtype="float64")
noLinear_data, noLinear_tar = get_Iris_noLinear(data, tar, 0)  # drop class 0 (setosa): the remaining pair is not linearly separable
classifier_MED(noLinear_data, noLinear_tar, 1, 2)              # versicolor (1) vs. virginica (2)
6. Multi-Class Bayes Classifier
6.1 Data Visualization
Accuracy: 0.9933333333333334
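The classifier below fits one multivariate Gaussian per class and predicts the class whose density is highest at the test point; since no priors are used, this amounts to a maximum-likelihood decision with equal priors. A sketch of the rule the code implements:

$$
\hat{y}(x) = \arg\max_{c}\; \mathcal{N}\!\left(x \mid \mu_c, \Sigma_c\right),
\qquad
\mu_c = \frac{1}{N_c}\sum_{i:\,y_i = c} x_i,
\qquad
\Sigma_c = \operatorname{Cov}\left(\{x_i : y_i = c\}\right).
$$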
6.2 Core Code
# K-fold cross-validation: the slicing below assumes the 150 samples are ordered by class (50 per class) and k = 5, so every fold gets 10 samples of each class
def K_Folds_Cross_Validation(data,tar,k):
    import numpy as np
    Set = []   # Set[i]: feature rows of fold i
    Tar = []   # Tar[i]: labels of fold i
for i in range(k):
tempSet=[]
tempTar=[]
tempSet.extend(data[i*10:(i+1)*10])
tempTar.extend(tar[i*10:(i+1)*10])
tempSet.extend(data[(i+5) * 10:(i + 6) * 10])
tempTar.extend(tar[(i+5) * 10:(i + 6) * 10])
tempSet.extend(data[(i+10) * 10:(i + 11) * 10])
tempTar.extend(tar[(i+10) * 10:(i + 11) * 10])
Set.append(tempSet)
Tar.append(tempTar)
return np.asarray(Set),np.asarray(Tar)
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from scipy import stats
import visualization,partition
# plt.rcParams['font.sans-serif']=['SimHei']  # uncomment to render Chinese labels correctly
plt.rcParams['savefig.dpi'] = 300  # resolution of saved figures
plt.rcParams['figure.dpi'] = 300   # on-screen figure resolution
map={
"Iris-setosa":0,
"Iris-versicolor":1,
"Iris-virginica":2
}
# Bayes classifier
class BayesParameter():  # container for the per-class Bayes parameters
def __init__(self,mean,cov,category):
self.mean=mean
self.cov=cov
self.category=category
class BayesClassifier():  # Bayes classifier with per-class Gaussian density estimates
def __init__(self):
self.parameters=[]
def train(self,X_data,Y_data):
        for category in set(Y_data):             # iterate over every class label
            selected = Y_data == category         # boolean mask selecting this class
            X_newData = X_data[selected]          # samples belonging to this class
            mean = np.mean(X_newData, axis=0)     # per-class mean vector
            cov = np.cov(X_newData.transpose())   # per-class covariance; equivalently np.cov(X_newData, rowvar=False)
self.parameters.append(BayesParameter(mean,cov,category))
    def predit(self, data):  # predict the most probable class for one sample
        res = -1
        probability = 0
        for parameter in self.parameters:
            p = stats.multivariate_normal.pdf(data, mean=parameter.mean, cov=parameter.cov)
            if p > probability:
                res = parameter.category
                probability = p
        return res
if __name__ == "__main__":
    data, tar = readData()
    foldSet, foldTar = partition.K_Folds_Cross_Validation(data, tar, 5)  # per-fold features and labels
    accuracy = 0
    print(foldTar[0].shape)
    for i in range(5):  # fold i serves as the test set
        X_data, Y_data = None, None
        for j in range(5):  # the remaining folds form the training set
            if i != j:
                if X_data is None:
                    X_data = foldSet[j]
                    Y_data = foldTar[j]
                else:
                    X_data = np.concatenate((X_data, foldSet[j]), axis=0)
                    Y_data = np.concatenate((Y_data, foldTar[j]), axis=0)
        bc = BayesClassifier()
        bc.train(X_data, Y_data)
        y_predict = [bc.predit(x) for x in foldSet[i]]
        tempAccuracy = np.sum(y_predict == foldTar[i]) / foldTar[i].shape[0]
        accuracy += tempAccuracy
    accuracy = accuracy / 5
    print(accuracy)
# https://blog.csdn.net/weixin_37895339/article/details/80351541  (on the relationship between covariance and the Gaussian distribution)
def data_visualization_2D_Bayes(data,tar):
testSet, testTar, trainSet, trainTar = partition.hold_out_partition(0.3, 0.7, data, tar)
bc = BayesClassifier()
bc.train(trainSet, trainTar)
testPredict = np.array([bc.predit(x) for x in testSet],dtype="int")
    # plotting
fig = plt.figure(figsize=(10, 10))
xx = [[0, 1], [1, 2], [2, 3], [0,2],[0,3],[1,3]]
    yy = [["sepal_length (cm)", "sepal_width (cm)"],
          ["sepal_width (cm)", "petal_length (cm)"],
          ["petal_length (cm)", "petal_width (cm)"],
          ["sepal_length (cm)", "petal_length (cm)"],
          ["sepal_length (cm)", "petal_width (cm)"],
          ["sepal_width (cm)", "petal_width (cm)"]]
for i in range(6):
ax = fig.add_subplot(321 + i)
x_max,x_min=testSet.max(axis=0)[xx[i][0]]+0.5,testSet.min(axis=0)[xx[i][0]]-0.5
y_max,y_min=testSet.max(axis=0)[xx[i][1]]+0.5,testSet.min(axis=0)[xx[i][1]]-0.5
xlist = np.linspace(x_min, x_max, 100) # Create 1-D arrays for x,y dimensions
ylist = np.linspace(y_min, y_max, 100)
XX, YY = np.meshgrid(xlist, ylist)
bc = BayesClassifier()
bc.train(trainSet[:, xx[i]],trainTar)
xys = [np.array([xx, yy]).reshape(1, -1) for xx, yy in zip(np.ravel(XX), np.ravel(YY))]
zz = np.array([bc.predit(x) for x in xys])
Z = zz.reshape(XX.shape)
plt.contourf(XX, YY, Z, 2, alpha=.1, colors=('blue', 'red', 'green'))
ax.scatter(testSet[testPredict == 0, xx[i][0]], testSet[testPredict == 0, xx[i][1]],
c='r', marker='o',
label="setosa")
ax.scatter(testSet[testPredict==1, xx[i][0]], testSet[testPredict==1, xx[i][1]], c='b', marker='x',
label="versicolor")
ax.scatter(testSet[testPredict==2, xx[i][0]], testSet[testPredict==2, xx[i][1]], c='g', marker='^',
label="virginica")
ax.set_xlabel(yy[i][0])
ax.set_ylabel(yy[i][1])
ax.legend(loc=0)
plt.show()
data,tar=readData()
data_visualization_2D_Bayes(data,tar)