Machine Learning Assignment 2: The Iris Dataset


1. Loading the Iris Dataset

The dataset object behaves like a dictionary with the following fields:

{
	"data": ...,
	"target": ...,
	"target_names": ...,
	"DESCR": ...
}

The data field is a NumPy array holding the four features of each sample.

target holds the class label (0, 1, or 2) of each flower.

target_names lists the species names (setosa, versicolor, virginica).

DESCR is a text description of the dataset.
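The later snippets call a helper readData() that the post never shows. A minimal sketch, assuming scikit-learn's load_iris (which returns exactly the fields listed above), could be:

def readData():
	# Hypothetical helper (not shown in the original post): load the Iris data and
	# return the 150x4 feature matrix plus the 150-element label vector
	# (0 = setosa, 1 = versicolor, 2 = virginica).
	from sklearn.datasets import load_iris
	iris = load_iris()
	return iris["data"], iris["target"]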

2. Dataset Visualization

2.1 Visualization Results

As the 3-D scatter plots show, versicolor and virginica are difficult to tell apart, while setosa separates from both much more clearly.

2.2 Implementation

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# plt.rcParams['font.sans-serif']=['SimHei']  # uncomment to render Chinese labels correctly
plt.rcParams['savefig.dpi'] = 300  # resolution of saved figures
plt.rcParams['figure.dpi'] = 300   # on-screen figure resolution

def data_visualization_3D(df_Iris, tar):
	# Plot the three classes in four different 3-D feature subsets.
	fig = plt.figure(figsize=(10, 10))
	xx = [[0, 1, 2], [1, 2, 3], [0, 1, 3], [0, 2, 3]]  # feature-index triples
	yy = [["sepal_length (cm)", "sepal_width (cm)", "petal_length (cm)"],
		  ["sepal_width (cm)", "petal_length (cm)", "petal_width (cm)"],
		  ["sepal_length (cm)", "sepal_width (cm)", "petal_width (cm)"],
		  ["sepal_length (cm)", "petal_length (cm)", "petal_width (cm)"]]
	for i in range(4):
		ax = fig.add_subplot(221 + i, projection="3d")
		ax.scatter(df_Iris[tar == 0, xx[i][0]], df_Iris[tar == 0, xx[i][1]], df_Iris[tar == 0, xx[i][2]],
				   c="r", marker="o", label="setosa")
		ax.scatter(df_Iris[tar == 1, xx[i][0]], df_Iris[tar == 1, xx[i][1]], df_Iris[tar == 1, xx[i][2]],
				   c="b", marker="x", label="versicolor")
		ax.scatter(df_Iris[tar == 2, xx[i][0]], df_Iris[tar == 2, xx[i][1]], df_Iris[tar == 2, xx[i][2]],
				   c="g", marker="^", label="virginica")
		ax.set_xlabel(yy[i][0])
		ax.set_ylabel(yy[i][1])
		ax.set_zlabel(yy[i][2])
		ax.legend(loc=0)
	plt.show()

    
  
data,tar=readData()
data_visualization_3D(data,tar)

3. MED Linear Classification

3.1 Classification Results

3.2 Quantitative Metrics

Accuracy: 1.0
Recall: 1.0
specificity: 1.0
F1_Score 1.0
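
For reference, these values follow the standard confusion-matrix definitions computed in the code below (TP, TN, FP, FN counted with posC as the positive class):

Accuracy    = (TP + TN) / (TP + TN + FP + FN)
Recall      = TP / (TP + FN)
Precision   = TP / (TP + FP)
Specificity = TN / (TN + FP)
F1          = 2 * Precision * Recall / (Precision + Recall)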

3.3 Core Code

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

def get_Iris_linear(data, tar, flag):
	# Drop the class labelled `flag`, keeping the two linearly separable classes.
	linear_data = [data[i] for i in range(data.shape[0]) if tar[i] != flag]
	linear_tar = [tar[i] for i in range(data.shape[0]) if tar[i] != flag]
	return np.asarray(linear_data, dtype="float64"), np.asarray(linear_tar, dtype="float64")

def hold_out_partition(testRate, trainRate, data, tar):  # hold-out split
	# Sample the test set from the two groups (tar == 1 and tar != 1) separately so
	# that it stays balanced. Assumes 50 samples per class, so trainRate is implied
	# by testRate and is not used directly.
	import random
	import numpy as np
	testSet, testTar = [], []
	trainSet, trainTar = [], []
	listNum = [i for i in range(data.shape[0]) if tar[i] == 1]
	for i in random.sample(listNum, int(testRate * 50)):
		testSet.append(data[i])
		testTar.append(tar[i])
		listNum.remove(i)
	for i in listNum:
		trainSet.append(data[i])
		trainTar.append(tar[i])
	listNum = [i for i in range(data.shape[0]) if tar[i] != 1]
	for i in random.sample(listNum, int(testRate * 50)):
		testSet.append(data[i])
		testTar.append(tar[i])
		listNum.remove(i)
	for i in listNum:
		trainSet.append(data[i])
		trainTar.append(tar[i])
	return (np.asarray(testSet, dtype="float64"), np.asarray(testTar, dtype="float64"),
			np.asarray(trainSet, dtype="float64"), np.asarray(trainTar, dtype="float64"))

cmap = {0: "r", 1: "b", 2: "g"}      # class colour
shapeMap = {0: "o", 1: "x", 2: "^"}  # class marker
map = {                              # label <-> name lookup (note: shadows the built-in map)
	"Iris-setosa": 0,
	"Iris-versicolor": 1,
	"Iris-virginica": 2,
	0: "setosa",
	1: "versicolor",
	2: "virginica"
}
def classifier_MED(data, tar, posC, negC):  # MED (minimum Euclidean distance) classifier
	testSet, testTar, trainSet, trainTar = hold_out_partition(0.3, 0.7, data, tar)
	# Class prototypes: z1 is the mean of the negative class, z2 of the positive class.
	C1, C2 = [], []
	N1, N2 = 0, 0
	for i in range(trainSet.shape[0]):
		if trainTar[i] == negC:
			N1 += 1
			C1.append(trainSet[i])
		elif trainTar[i] == posC:
			N2 += 1
			C2.append(trainSet[i])
	C1, C2 = np.asarray(C1), np.asarray(C2)
	z1, z2 = C1.sum(axis=0) / N1, C2.sum(axis=0) / N2
	# Decision value: positive means x lies closer to z2 than to z1.
	testRes = []
	for x in testSet:
		res = np.dot((z2 - z1).transpose(), (x - (z1 + z2) / 2))
		testRes.append(res)
	testTar = testTar.astype("int16")
	TP, FP, TN, FN = 0, 0, 0, 0
	for i in range(len(testRes)):
		# posC (C2) is the positive class, negC (C1) the negative class.
		if testTar[i] == posC and testRes[i] >= 0:
			TP += 1
		elif testTar[i] == posC and testRes[i] < 0:
			FN += 1
		elif testTar[i] == negC and testRes[i] < 0:
			TN += 1
		elif testTar[i] == negC and testRes[i] >= 0:
			FP += 1
	testRes = np.array(testRes)
	accuracy = float((TP + TN) / (TP + TN + FP + FN))
	recall = float(TP / (TP + FN))
	precision = float(TP / (TP + FP))
	specificity = float(TN / (TN + FP))
	F1_Score = float((2 * recall * precision) / (recall + precision))
	print("Accuracy:", accuracy)
	print("Recall:", recall)
	print("specificity:", specificity)
	print("F1_Score", F1_Score)

	# Plotting: scatter the test points by predicted class and draw the decision
	# plane restricted to each 3-feature subset.
	fig = plt.figure(figsize=(10, 10))
	xx = [[0, 1, 2], [1, 2, 3], [0, 1, 3], [0, 2, 3]]
	yy = [["sepal_length (cm)", "sepal_width (cm)", "petal_length (cm)"],
		  ["sepal_width (cm)", "petal_length (cm)", "petal_width (cm)"],
		  ["sepal_length (cm)", "sepal_width (cm)", "petal_width (cm)"],
		  ["sepal_length (cm)", "petal_length (cm)", "petal_width (cm)"]]
	for i in range(4):
		ax = fig.add_subplot(221 + i, projection="3d")
		X, Y = np.meshgrid(np.arange(testSet.min(axis=0)[xx[i][0]], testSet.max(axis=0)[xx[i][0]], 1),
						   np.arange(testSet.min(axis=0)[xx[i][1]], testSet.max(axis=0)[xx[i][1]], 1))
		u1 = np.array([z1[xx[i][0]], z1[xx[i][1]], z1[xx[i][2]]])
		u2 = np.array([z2[xx[i][0]], z2[xx[i][1]], z2[xx[i][2]]])
		u = (u2 - u1).transpose()
		Z = (np.dot(u, (u1 + u2) / 2) - u[0] * X - u[1] * Y) / u[2]
		ax.scatter(testSet[testRes >= 0, xx[i][0]], testSet[testRes >= 0, xx[i][1]], testSet[testRes >= 0, xx[i][2]],
				   c=cmap[posC], marker=shapeMap[posC], label=map[posC])
		ax.scatter(testSet[testRes < 0, xx[i][0]], testSet[testRes < 0, xx[i][1]], testSet[testRes < 0, xx[i][2]],
				   c=cmap[negC], marker=shapeMap[negC], label=map[negC])
		ax.set_xlabel(yy[i][0])
		ax.set_ylabel(yy[i][1])
		ax.set_zlabel(yy[i][2])
		ax.plot_surface(X, Y, Z, alpha=0.4)
		ax.legend(loc=0)
	plt.show()
	
# Remove virginica (label 2); setosa (0) is the positive class, versicolor (1) the negative class.
linear_data, linear_tar = get_Iris_linear(data, tar, 2)
classifier_MED(linear_data, linear_tar, 0, 1)
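
The decision rule at the heart of classifier_MED is worth spelling out. With z1 and z2 the training means (prototypes) of the negative and positive class, the minimum-Euclidean-distance rule assigns x to the nearer prototype, which reduces to the sign of a linear function:

g(x) = (z2 - z1)^T (x - (z1 + z2) / 2)

g(x) >= 0 predicts the positive class and g(x) < 0 the negative class. The boundary g(x) = 0 is the hyperplane through the midpoint of the two prototypes, orthogonal to z2 - z1; the plot_surface call above draws this boundary restricted to each plotted 3-feature subset.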

4. Dataset Whitening

4.1 Visualization

After whitening, the data become easier to separate along some dimensions.
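
The transform implemented below is standard whitening: given the covariance matrix Sigma of the original features and its eigendecomposition

Sigma = V * Lambda * V^T   (columns of V are eigenvectors, Lambda is the diagonal matrix of eigenvalues)

the whitening matrix is

W = Lambda^(-1/2) * V^T

so the transformed data x' = W x has covariance W * Sigma * W^T = I, i.e. the features become uncorrelated with unit variance.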

4.2 Core Code

def whitening(data):
	Ex = np.cov(data, rowvar=False)     # covariance matrix of the original features
	a, b = np.linalg.eig(Ex)            # eigenvalues a and eigenvectors (columns of b) of Ex
	a, b = np.real(a), np.real(b)       # Ex is symmetric, so these are real anyway
	a = np.clip(a, 1e-12, None)         # guard against zero eigenvalues producing NaN/inf
	b = b / np.linalg.norm(b, axis=0)   # normalize eigenvector columns (eig already returns unit vectors)
	A = np.diag(a ** (-0.5))            # Lambda^(-1/2)
	W = np.dot(A, b.transpose())        # whitening matrix W = Lambda^(-1/2) * V^T
	print(np.dot(W, np.dot(Ex, W.transpose())))  # sanity check: should be close to the identity
	# Row-vector convention: x' = W x  <=>  X' = X W^T
	return np.dot(data, W.transpose())

visualization.data_visualization_3D(whitening(data), tar)
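
A quick sanity check (assuming the whitening function above): the empirical covariance of the transformed data should be close to the 4x4 identity matrix.

white_data = whitening(data)
print(np.round(np.cov(white_data, rowvar=False), 3))  # approximately the identity matrix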

5. MED Classification (Non-Linearly-Separable Case)

5.1 Classification Results

5.2 Quantitative Metrics

As the visualization in Section 2.1 suggested, versicolor and virginica overlap, so the metrics are noticeably lower than in the linearly separable case:

Accuracy: 0.9
Recall: 0.8666666666666667
specificity: 0.9333333333333333
F1_Score 0.896551724137931

5.3 Core Code

def get_Iris_noLinear(data, tar, flag):
	# Same filtering as get_Iris_linear: drop class `flag` (here setosa), leaving the
	# two classes that are not linearly separable.
	linear_data = [data[i] for i in range(data.shape[0]) if tar[i] != flag]
	linear_tar = [tar[i] for i in range(data.shape[0]) if tar[i] != flag]
	return np.asarray(linear_data, dtype="float64"), np.asarray(linear_tar, dtype="float64")

noLinear_data,noLinear_tar=get_Iris_noLinear(data,tar,0)
classifier_MED(noLinear_data,noLinear_tar,1,2)

6. Multi-Class Bayes Classifier

6.1 Data Visualization

[Figure: Bayes decision regions over pairs of features]

Accuracy: 0.9933333333333334
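
The classifier in 6.2 fits one multivariate Gaussian per class, with mu_c and Sigma_c the sample mean and covariance of the training samples of class c, and predicts the class whose class-conditional density at x is largest:

y_hat(x) = argmax_c N(x; mu_c, Sigma_c)

Comparing densities only is equivalent to a maximum a posteriori rule with equal class priors, which is reasonable here because the Iris classes are (nearly) balanced in the training splits used below.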

6.2 Core Code

# K-fold split
def K_Folds_Cross_Validation(data, tar, k):
	# Builds k balanced folds. Assumes the standard Iris ordering (150 samples,
	# 50 per class, grouped by class) and k = 5, so each fold gets 10 samples of
	# every class.
	import numpy as np
	Set = []
	Tar = []
	for i in range(k):
		tempSet = []
		tempTar = []
		tempSet.extend(data[i * 10:(i + 1) * 10])          # 10 setosa
		tempTar.extend(tar[i * 10:(i + 1) * 10])
		tempSet.extend(data[(i + 5) * 10:(i + 6) * 10])    # 10 versicolor
		tempTar.extend(tar[(i + 5) * 10:(i + 6) * 10])
		tempSet.extend(data[(i + 10) * 10:(i + 11) * 10])  # 10 virginica
		tempTar.extend(tar[(i + 10) * 10:(i + 11) * 10])
		Set.append(tempSet)
		Tar.append(tempTar)
	return np.asarray(Set), np.asarray(Tar)
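
The slicing above is hard-coded for the Iris layout and k = 5. As an illustrative alternative (a sketch, not the author's partition module), a generic split can shuffle the indices once and cut them into k folds:

import numpy as np

def k_fold_indices(n_samples, k, seed=0):
	# Shuffle all sample indices once, then split them into k nearly equal folds.
	rng = np.random.default_rng(seed)
	return np.array_split(rng.permutation(n_samples), k)

# Example: folds[i] gives the test indices of fold i; the remaining folds form the training set.
folds = k_fold_indices(150, 5)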

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from scipy import stats
import visualization, partition
# plt.rcParams['font.sans-serif']=['SimHei']  # uncomment to render Chinese labels correctly
plt.rcParams['savefig.dpi'] = 300  # resolution of saved figures
plt.rcParams['figure.dpi'] = 300   # on-screen figure resolution
map = {  # species-name -> label lookup (note: shadows the built-in map)
	"Iris-setosa": 0,
	"Iris-versicolor": 1,
	"Iris-virginica": 2
}
# Bayes classifier
class BayesParameter():  # stores the per-class parameters of the Bayes classifier

	def __init__(self, mean, cov, category):
		self.mean = mean
		self.cov = cov
		self.category = category

class BayesClassifier():  # Bayes classifier with Gaussian density estimation

	def __init__(self):
		self.parameters = []

	def train(self, X_data, Y_data):
		for category in set(Y_data):            # iterate over every class label
			selected = Y_data == category       # boolean mask for this class
			X_newData = X_data[selected]        # samples of this class
			mean = np.mean(X_newData, axis=0)   # class mean
			# Pitfall: np.cov expects variables in rows by default, so transpose
			# (or equivalently pass np.cov(X_newData, rowvar=False)).
			cov = np.cov(X_newData.transpose())
			self.parameters.append(BayesParameter(mean, cov, category))

	def predit(self, data):
		# Return the class whose Gaussian density at `data` is largest.
		res = -1
		probability = 0
		for parameter in self.parameters:
			p = stats.multivariate_normal.pdf(data, mean=parameter.mean, cov=parameter.cov)
			if p > probability:
				res = parameter.category
				probability = p
		return res

    
if __name__ == "__main__":
	data, tar = readData()  # full Iris data and labels
	folds, foldTar = partition.K_Folds_Cross_Validation(data, tar, 5)
	accuracy = 0
	for i in range(5):  # fold i is the test set, the other four folds form the training set
		X_data, Y_data = None, None
		for j in range(5):
			if i != j:
				if X_data is None:
					X_data, Y_data = folds[j], foldTar[j]
				else:
					X_data = np.concatenate((X_data, folds[j]), axis=0)
					Y_data = np.concatenate((Y_data, foldTar[j]), axis=0)
		bc = BayesClassifier()
		bc.train(X_data, Y_data)
		y_predict = np.array([bc.predit(x) for x in folds[i]])
		accuracy += np.sum(y_predict == foldTar[i]) / foldTar[i].shape[0]
	accuracy = accuracy / 5
	print("Accuracy:", accuracy)
# https://blog.csdn.net/weixin_37895339/article/details/80351541  relation between the covariance matrix and the Gaussian distribution
def data_visualization_2D_Bayes(data, tar):
	testSet, testTar, trainSet, trainTar = partition.hold_out_partition(0.3, 0.7, data, tar)
	bc = BayesClassifier()
	bc.train(trainSet, trainTar)
	testPredict = np.array([bc.predit(x) for x in testSet], dtype="int")

	# Plotting: for each 2-D feature pair, train a Bayes classifier on just those
	# two features, shade its decision regions, and scatter the test predictions.
	fig = plt.figure(figsize=(10, 10))
	xx = [[0, 1], [1, 2], [2, 3], [0, 2], [0, 3], [1, 3]]
	yy = [["sepal_length (cm)", "sepal_width (cm)"],
		  ["sepal_width (cm)", "petal_length (cm)"],
		  ["petal_length (cm)", "petal_width (cm)"],
		  ["sepal_length (cm)", "petal_length (cm)"],
		  ["sepal_length (cm)", "petal_width (cm)"],
		  ["sepal_width (cm)", "petal_width (cm)"]]
	for i in range(6):
		ax = fig.add_subplot(321 + i)
		x_max, x_min = testSet.max(axis=0)[xx[i][0]] + 0.5, testSet.min(axis=0)[xx[i][0]] - 0.5
		y_max, y_min = testSet.max(axis=0)[xx[i][1]] + 0.5, testSet.min(axis=0)[xx[i][1]] - 0.5
		xlist = np.linspace(x_min, x_max, 100)  # 1-D grids for the two plotted features
		ylist = np.linspace(y_min, y_max, 100)
		XX, YY = np.meshgrid(xlist, ylist)
		bc = BayesClassifier()
		bc.train(trainSet[:, xx[i]], trainTar)
		xys = [np.array([px, py]).reshape(1, -1) for px, py in zip(np.ravel(XX), np.ravel(YY))]
		zz = np.array([bc.predit(p) for p in xys])
		Z = zz.reshape(XX.shape)
		plt.contourf(XX, YY, Z, 2, alpha=.1, colors=('blue', 'red', 'green'))
		ax.scatter(testSet[testPredict == 0, xx[i][0]], testSet[testPredict == 0, xx[i][1]],
				   c='r', marker='o', label="setosa")
		ax.scatter(testSet[testPredict == 1, xx[i][0]], testSet[testPredict == 1, xx[i][1]],
				   c='b', marker='x', label="versicolor")
		ax.scatter(testSet[testPredict == 2, xx[i][0]], testSet[testPredict == 2, xx[i][1]],
				   c='g', marker='^', label="virginica")
		ax.set_xlabel(yy[i][0])
		ax.set_ylabel(yy[i][1])
		ax.legend(loc=0)
	plt.show()
data,tar=readData()
data_visualization_2D_Bayes(data,tar)
