周志华机器学习 CH3.5 编程实现线性判别分析

方法一:利用sklearn库函数实现

import numpy as np
import matplotlib.pyplot as plt

'''
# LDA via sklearn
'''
from sklearn import model_selection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics

fr = open("3.0.txt")
dataSet = np.loadtxt(fr, delimiter = ",")
X = dataSet[:, 1:3]
Y = dataSet[:, 3]


# draw scatter diagram to show the raw data
f1 = plt.figure(1)
plt.title('watermelon_3a')
plt.xlabel('density')
plt.ylabel('ratio_sugar')
plt.scatter(X[Y == 0,0], X[Y == 0,1], marker = 'o', color = 'k', s=100, label = 'bad')
plt.scatter(X[Y == 1,0], X[Y == 1,1], marker = 'o', color = 'g', s=100, label = 'good')
plt.legend(loc = 'upper right')

X_train,X_test,Y_train,Y_test=model_selection.train_test_split(X,Y,test_size=0.5,random_state = 0)


#进行拟合
lda_model=LinearDiscriminantAnalysis(solver = 'lsqr',shrinkage = None).fit(X,Y)

y_pred=lda_model.predict(X_test)

print(metrics.confusion_matrix(Y_test,y_pred))
print(metrics.classification_report(Y_test,y_pred))


#画出决策边界
f2=plt.figure(2)
h=0.01
X0_min,X0_max=X[:,0].min()-0.1,X[:,0].max()+0.1
X1_min,X1_max=X[:,1].min()-0.1,X[:,1].max()+0.1
X0,X1=np.meshgrid(np.arange(X0_min,X0_max,h),np.meshgrid(np.arange(X1_min,X1_max,h)))
z=lda_model.predict(np.c_[X0.ravel(),X1.ravel()])
print(z.shape)
#  Put the result into a color plot
print(X0.shape)
z=z.reshape(X0.shape)
print(z)

print(z.shape)
plt.contour(X0,X1,z)

# Plot also the training pointsplt.title('watermelon_3a')
plt.title('watermelon_3a')
plt.xlabel('density')
plt.ylabel('ratio_sugar')
plt.scatter(X[Y == 0,0], X[Y == 0,1], marker = 'o', color = 'k', s=100, label = 'bad')
plt.scatter(X[Y == 1,0], X[Y == 1,1], marker = 'o', color = 'g', s=100, label = 'good')
plt.show()

 运行结果:

周志华机器学习 CH3.5 编程实现线性判别分析_第1张图片

 

 

方法二:利用python自己编程实现

import numpy as np
import matplotlib.pyplot as plt

def loadDataSet(fileName):
    '''
    fr=open(fileName)
    X=[];Y=[]
    for line in fr.readlines():
        lineArr=line.strip().split(',')
        X.append([float(lineArr[1]),float(lineArr[2])])
        Y.append(lineArr[3])
    '''
    fr=open(fileName)
    dataSet=np.loadtxt(fr,delimiter=",")
    X=dataSet[:,1:3]
    Y=dataSet[:,3]

    print(type(X),type(Y))
    return np.array(X),np.array(Y)

def plotDataSet(X,Y):
    # 绘制数据集
    f1 = plt.figure()
    plt.title("watermelon_3a")
    plt.xlabel("密度")
    plt.ylabel("含糖量")
    plt.scatter(X[Y == 0, 0], X[Y == 0, 1], marker = 'o', color = 'k', s = 100, label = 'bad')
    plt.scatter(X[Y == 1, 0], X[Y == 1, 1], marker = 'o', color = 'g', s = 100, label = 'good')
    plt.legend(loc = 'upper right')
    plt.show()

# 求类内散度矩阵
def Sw(X,Y):
    u=[]
    u.append(np.mean(X[Y==0],axis=0))  #column means
    u.append(np.mean(X[Y==1],axis=0))
    u=np.array(u)
    m,n=np.shape(X)
    sw=np.zeros((n,n))

    X0=X[Y==0]
    X1=X[Y==1]
    #  !!!!!
    # 一定注意要把u变成矩阵!!!!
    # 因为u.shape=(1,2),直接u.T转置还是一个一维的数组,参与运算出错!!
    # 因为u=[1,2],u.T=[1,2]
    # 变成矩阵u=[[1,2]],u.T=[[1],[2]]
    # 弄了一天!!才发现这里出了问题!!!!
    #(或者:u=u.reshape((2,1)),这样之后在转置,和np.mat(u)效果一样!)

    for i in range(np.shape(X0)[0]):
        sw+=np.dot(np.mat((X0[i]-u[0])).T,np.mat(X0[i]-u[0]))
    for i in range(np.shape(X1)[0]):
        sw+=np.dot(np.mat((X1[i]-u[1])).T,np.mat(X1[i]-u[1]))
    '''

    for i in range(m):
        if Y[i]==0:
            x_temp=X[i]-u[0]
            print("temp= ",np.array(x_temp).T,"temp.T= ",(x_temp))
        else:
            
            x_temp=X[i]-u[1]
           # print(x_temp.T)
        sw+=np.dot(np.mat(x_temp).T,np.mat(x_temp))
        print(i, "  ", sw)
            '''
    print(sw)
    return sw,u


#计算w
def wFunction(sw,u):
    m,n=np.shape(X)
    sw_inv=np.linalg.pinv(sw)
    w=np.dot(sw_inv,(u[0]-u[1]).T)
    return w

# draw projective point on the line
# 计算投影点的坐标,
# 利用直线和垂直线的斜率相乘=1,求两直线交点坐标
def GetProjectivePoint_2D(point, line):
    a = point[0]
    b = point[1]
    k = line[0]
    t = line[1]
    if   k == 0:      return [a, t] # 平行于横轴
    elif k == np.inf: return [0, b] # 平行于纵轴
    x = (a+k*b-k*t) / (k*k+1)  #斜线
    y = k*x + t
    return [x, y]


if __name__=="__main__":
    X,Y=loadDataSet("3.0.txt")
    #print(X)
    #print(Y)
    #plotDataSet(X,Y)
    sw,u=Sw(X,Y)
    w=wFunction(sw,u)
    print(w)

    f2 = plt.figure(2)

    p0_x0 = -X[:, 0].max()
    p0_x1 = (-w[0]/w[1])*p0_x0
    p1_x0 = X[:, 0].max()
    p1_x1 = (-w[0]/w[1])*p1_x0
    plt.title('watermelon_3a - LDA')
    plt.xlabel('density')
    plt.ylabel('ratio_sugar')
    plt.scatter(X[Y == 0, 0], X[Y == 0, 1], marker = 'o', color = 'k', s = 10, label = 'bad')
    plt.scatter(X[Y == 1, 0], X[Y == 1, 1], marker = 'o', color = 'g', s = 10, label = 'good')
    plt.legend(loc = 'upper right')
    plt.plot([p0_x0, p1_x0], [p0_x1, p1_x1])

    m, n = np.shape(X)
    for i in range(m):
        x_p = GetProjectivePoint_2D([X[i][0], X[i][1]], [-w[0]/w[1], 0])
        if Y[i] == 0:
            plt.plot(x_p[0], x_p[1], 'ko', markersize = 5)
        if Y[i] == 1:
            plt.plot(x_p[0], x_p[1], 'go', markersize = 5)
        plt.plot([x_p[0], X[i, 0]], [x_p[1], X[i, 1]], 'c--', linewidth = 0.3)
    plt.show()

运行结果:

周志华机器学习 CH3.5 编程实现线性判别分析_第2张图片

 

周志华机器学习 CH3.5 编程实现线性判别分析_第3张图片

 

周志华机器学习 CH3.5 编程实现线性判别分析_第4张图片

删除一个离群的训练样本后,得以改善:

周志华机器学习 CH3.5 编程实现线性判别分析_第5张图片

 

你可能感兴趣的:(机器学习,周志华西瓜书练习题)