附:
Pandas文档链接
sklearn文档链接
采用鸢尾花数据
数据链接:https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
数据集共3类,共有150条花的基本数据,三种花各50条,每条数据包括萼片长度,萼片宽度,花瓣长度,花瓣宽度4种特征
使用pandas读取数据集,代码:
import pandas as pd
# Console display: do not truncate columns or rows, and widen the output
# so that a full record fits on one line.
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Load the comma-separated file; header=None means no row is used as a header.
df = pd.read_csv('iris.data', sep=',', header=None)

# Map column position -> feature name.
feature_dict = dict(enumerate((
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
)))

# Feature names in positional order, followed by the class-label column.
df.columns = [feature_dict[pos] for pos in sorted(feature_dict)] + ['class label']
print(df.head(150))  # print the first 150 rows
from sklearn.preprocessing import LabelEncoder
# Feature matrix: the four measurement columns as a NumPy array.
X = df.loc[:, [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
]].values
# Target vector: the class-label column.
y = df['class label'].values
# Encode the labels as {1: 'Setosa', 2: 'Versicolor', 3: 'Virginica'}.
enc = LabelEncoder()
label_encoder = enc.fit(y)
# The encoded labels are shifted by one so that numbering starts at 1.
y = 1 + label_encoder.transform(y)
使用sklearn中的LabelEncoder完成标签转换,分两步走:先fit,再transform,转换结果如下:
均值包括3个类别,每个类别有4个维度,在每个维度上都要计算均值
import numpy as np
# Print NumPy floating-point values with 4 digits of precision
np.set_printoptions(precision=4)
# Compute the per-class mean vectors
def get_mean(X, y):
    """Return the feature-wise mean of the samples of classes 1, 2 and 3.

    Args:
        X: 2-D NumPy array with one sample per row and one feature per column.
        y: 1-D NumPy array of integer class labels (1, 2 or 3), aligned
            with the rows of ``X``.

    Returns:
        A list of three 1-D arrays; element ``k`` is the mean vector of the
        samples whose label is ``k + 1``.
    """
    # Collected mean vectors, in label order
    mean_vectors = []
    # The three classes are labelled 1..3
    for i in range(1, 4):
        # Mean of every feature over the rows belonging to class i
        mean_vectors.append(np.mean(X[y == i], axis=0))
        print('均值类别 %s: %s' % (i, mean_vectors[i - 1]))
    # The scatter-matrix functions take these vectors as input, so they
    # must be handed back to the caller.
    return mean_vectors
# Compute the within-class scatter matrix
def class_in_matrix(mean_vectors, X, y):
    """Return the within-class scatter matrix S_W.

    S_W is the sum, over every class c and every sample x of that class,
    of the outer product (x - m_c)(x - m_c)^T, where m_c is the class mean.

    Args:
        mean_vectors: sequence of per-class mean vectors; element ``k`` is
            the mean of the class labelled ``k + 1``.
        X: 2-D NumPy array with one sample per row and one feature per column.
        y: 1-D NumPy array of integer class labels aligned with the rows
            of ``X``.

    Returns:
        A square array of shape ``(n_features, n_features)``.
    """
    n_features = X.shape[1]
    S_W = np.zeros((n_features, n_features))
    for i, mv in enumerate(mean_vectors, start=1):
        # Deviations of the samples of class i from their class mean
        centered = X[y == i] - np.asarray(mv).reshape(1, n_features)
        # centered.T @ centered equals the sum of the per-row outer products
        S_W += centered.T.dot(centered)
    print('类内散布矩阵:\n', S_W)
    # get_eig() takes this matrix as input, so it must be returned.
    return S_W
# Compute the between-class scatter matrix
def class_out_matrix(mean_vectors, X, y):
    """Return the between-class scatter matrix S_B.

    S_B is the sum over every class c of n_c * (m_c - m)(m_c - m)^T, where
    n_c is the number of samples in the class, m_c the class mean and m the
    mean of all samples.

    Args:
        mean_vectors: sequence of per-class mean vectors; element ``k`` is
            the mean of the class labelled ``k + 1``.
        X: 2-D NumPy array with one sample per row and one feature per column.
        y: 1-D NumPy array of integer class labels aligned with the rows
            of ``X``.

    Returns:
        A square array of shape ``(n_features, n_features)``.
    """
    n_features = X.shape[1]
    # Mean of all samples as a column vector; it does not change per class,
    # so it is shaped once outside the loop.
    overall_mean = np.mean(X, axis=0).reshape(n_features, 1)
    S_B = np.zeros((n_features, n_features))
    for i, mean_vec in enumerate(mean_vectors):
        # Number of samples carrying the label of this class (labels start at 1)
        n = X[y == i + 1, :].shape[0]
        diff = np.asarray(mean_vec).reshape(n_features, 1) - overall_mean
        S_B += n * diff.dot(diff.T)
    print('类间散布矩阵:\n', S_B)
    # get_eig() takes this matrix as input, so it must be returned.
    return S_B
# Compute eigenvalues and eigenvectors
def get_eig(S_W, S_B):
    """Return the eigen-decomposition of ``inv(S_W) . S_B``.

    Every eigenvector and its eigenvalue are also printed (real parts only).

    Args:
        S_W: square within-class scatter matrix; it is inverted here.
        S_B: between-class scatter matrix of the same shape.

    Returns:
        A tuple ``(eig_vals, eig_vecs)`` as produced by ``np.linalg.eig``;
        column ``i`` of ``eig_vecs`` is the eigenvector for ``eig_vals[i]``.
    """
    eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    # Report each eigenvalue together with its eigenvector
    for i, eig_val in enumerate(eig_vals):
        # Column vector form, for whatever number of features the input has
        eigvec_sc = eig_vecs[:, i].reshape(-1, 1)
        print('\n特征向量{}:\n{}'.format(i + 1, eigvec_sc.real))
        print('特征值{:}:{:.2e}'.format(i + 1, eig_val.real))
    return eig_vals, eig_vecs
# Pair every eigenvalue magnitude with its eigenvector (a column of eig_vecs)
eig_pairs = [(np.abs(val), eig_vecs[:, idx]) for idx, val in enumerate(eig_vals)]
# Sort the pairs by eigenvalue magnitude, largest first
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
print('特征值排序结果:\n')
for pair in eig_pairs:
    print(pair[0])

print('特征值占总体百分比:\n')
# NOTE: the denominator sums the raw eigenvalues while the numerators below
# are magnitudes; only the real part of each ratio is printed.
eigv_sum = sum(eig_vals)
for i, j in enumerate(eig_pairs):
    print('特征值 {0:}:{1:.2%}'.format(i + 1, (j[0] / eigv_sum).real))

# Dimensionality reduction: stack the two leading eigenvectors as the columns
# of the projection matrix W and project the samples onto them.
W = np.hstack((eig_pairs[0][1].reshape(-1, 1), eig_pairs[1][1].reshape(-1, 1)))
X_lda = X.dot(W)
print(X_lda.shape)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
import pandas as pd
# 读取数据
from sklearn.preprocessing import LabelEncoder
def read(path='iris.data'):
    """Load the Iris CSV file into a DataFrame with descriptive column names.

    Args:
        path: location of the comma-separated data file. It is read with
            ``header=None``; defaults to ``'iris.data'`` in the working
            directory.

    Returns:
        A pandas DataFrame whose columns are the four feature names
        followed by ``'class label'``.

    Raises:
        ValueError: if the file does not contain exactly five columns
            (raised by pandas when the column names are assigned).
    """
    df = pd.read_csv(
        filepath_or_buffer=path,
        header=None,
        sep=',',
    )
    # Name the columns: four measurements, then the class label
    df.columns = [
        'sepal length in cm',
        'sepal width in cm',
        'petal length in cm',
        'petal width in cm',
        'class label',
    ]
    return df
# Convert the class labels to integers
def transform_label(df):
    """Split a DataFrame into a feature matrix and an integer label vector.

    Args:
        df: DataFrame containing the four feature columns
            ('sepal length in cm', 'sepal width in cm',
            'petal length in cm', 'petal width in cm') and a
            'class label' column.

    Returns:
        A tuple ``(X, y)``: ``X`` holds the values of the four feature
        columns, ``y`` holds the class labels encoded by ``LabelEncoder``
        and shifted by one.
    """
    X = df[['sepal length in cm',
            'sepal width in cm',
            'petal length in cm',
            'petal width in cm']].values
    y = df['class label'].values
    # Encode the labels as {1: 'Setosa', 2: 'Versicolor', 3: 'Virginica'};
    # the +1 shifts the encoder's output so that numbering starts at 1.
    enc = LabelEncoder()
    label_encoder = enc.fit(y)
    y = label_encoder.transform(y) + 1
    return X, y
# Load the data, then build the feature matrix and the integer label vector
df = read()
X, y = transform_label(df)

# Fit scikit-learn's LDA on the labelled data and project X onto 2 components
sklearn_lda = LDA(n_components=2)
X_lda_sklearn = sklearn_lda.fit(X, y).transform(X)
print(X_lda_sklearn.shape)

# Scatter plot of the two components, colored by class label
plt.scatter(x=X_lda_sklearn[:, 0], y=X_lda_sklearn[:, 1], c=y, marker='o')
plt.show()
关键代码:
# Create an LDA model that keeps 2 components
sklearn_lda = LDA(n_components=2)
# Fit the model on (X, y) and project X in a single call
X_lda_sklearn = sklearn_lda.fit_transform(X, y)
# Print the shape of the projected data
print(X_lda_sklearn.shape)