Level 1: Implementing the PCA Dimensionality Reduction Algorithm
import numpy as np
import matplotlib.pyplot as plt
# Create a random dataset with numpy
np.random.seed(3)
X = np.empty([100,2])
X[:,0] = np.random.uniform(0., 100., size=100)
X[:,1] = 0.75 * X[:,0] + 3. + np.random.normal(0., 10., size=100)
# Mean-normalization: subtract the column means so the data is centered at the origin
def demean(X):
    # Return the matrix with the mean of each column subtracted from every row
    #***********Begin**********
    return X - np.mean(X, axis=0)
    #************End***********
X_demean = demean(X)
# Variance of the data projected onto direction w
def getVariance(w, X):
    #***********Begin**********
    return np.sum(X.dot(w) ** 2) / len(X)
    #************End***********
# Gradient of the projected variance with respect to w: (2/m) * X^T * X * w
def getGradient(w, X):
    #***********Begin**********
    return X.T.dot(X.dot(w)) * 2. / len(X)
    #************End***********
# Normalize w to a unit vector
def direction(w):
    #***********Begin**********
    return w / np.linalg.norm(w)
    #************End***********
def gradient_ascent(X, initial_w, eta, n_iters=1e4, epsilon=1e-8):
    w = direction(initial_w)
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = getGradient(w, X)
        last_w = w
        w = last_w + eta * gradient
        w = direction(w)  # Re-normalize w to a unit vector after each step
        if abs(getVariance(w, X) - getVariance(last_w, X)) < epsilon:
            break
        cur_iter += 1
    return w
if __name__ == "__main__":
    initial_w = np.random.random(X.shape[1])
    eta = 0.001
    w = gradient_ascent(X_demean, initial_w, eta)
    print(w)
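    # Sanity check (a minimal sketch, not part of the graded exercise): the
    # direction found by gradient ascent should match, up to sign, the
    # principal eigenvector of the covariance matrix of the demeaned data.
    cov = X_demean.T.dot(X_demean) / len(X_demean)
    vals, vecs = np.linalg.eig(cov)
    top = vecs[:, np.argmax(vals)]
    print(top if top[0] * w[0] > 0 else -top)  # should be close to w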
Level 2: Implementing the LDA Dimensionality Reduction Algorithm
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
# Show all columns
pd.set_option('display.max_columns', None)
# Show all rows
pd.set_option('display.max_rows', None)
# Widen the display so rows are not wrapped
pd.set_option('display.width', 1000)
df = pd.read_csv(
filepath_or_buffer='/data/workspace/myshixun/实战2/irisData.data',
header=None,
sep=','
)
# Map column indices to feature names
feature_dict = {
    i: label for i, label in zip(
        range(4),
        (
            'sepalLength',
            'sepalWidth',
            'petalLength',
            'petalWidth',
        )
    )
}
# Assign the column names
df.columns = [l for i, l in sorted(feature_dict.items())] + ['class label']
X = df[['sepalLength',
        'sepalWidth',
        'petalLength',
        'petalWidth']].values
y = df['class label'].values
# Encode the labels as {1: 'Setosa', 2: 'Versicolor', 3: 'Virginica'}
enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y) + 1
# Limit printed floats to 4 decimal places
np.set_printoptions(precision=4)
# Compute the per-class mean vectors
def get_mean(X, y):
    # Collect the mean vector of each class
    mean_vectors = []
    # There are 3 classes, labeled 1..3
    #**********begin**********#
    for i in range(1, 4):
        mean_vectors.append(np.mean(X[y == i], axis=0))
    #***********end***********#
    return mean_vectors
mean_vectors = get_mean(X, y)
# Compute the within-class scatter matrix
def class_in_matrix(mean_vectors, X, y):
    # The raw data has 4 features, so the scatter matrix is 4x4
    S_W = np.zeros((4, 4))
    for i, mv in zip(range(1, 4), mean_vectors):
        class_sc_mat = np.zeros((4, 4))
        # Select the samples of the current class
        for row in X[y == i]:
            # Accumulate the matrix form of the per-sample contribution
            #**********begin**********#
            row, mv = row.reshape(4, 1), mv.reshape(4, 1)
            # Formula: sum of (x - m_i)(x - m_i)^T over samples of class i
            class_sc_mat += (row - mv).dot((row - mv).T)
            #***********end***********#
        S_W += class_sc_mat
    print('Within-class scatter matrix:\n', S_W)
    return S_W
S_W = class_in_matrix(mean_vectors,X,y)
# Compute the between-class scatter matrix
def class_out_matrix(mean_vectors, X, y):
    overall_mean = np.mean(X, axis=0)
    # Build the between-class scatter matrix
    S_B = np.zeros((4, 4))
    # Accumulate the contribution of each class
    for i, mean_vec in enumerate(mean_vectors):
        # Number of samples in the current class
        n = X[y == i + 1, :].shape[0]
        mean_vec = mean_vec.reshape(4, 1)
        overall_mean = overall_mean.reshape(4, 1)
        # Formula: sum of n_i * (m_i - m)(m_i - m)^T over classes
        S_B += n * (mean_vec - overall_mean).dot((mean_vec - overall_mean).T)
    print('Between-class scatter matrix:\n', S_B)
    return S_B
S_B = class_out_matrix(mean_vectors,X,y)
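# Sanity check (a minimal sketch, assuming the standard scatter identity
# S_T = S_W + S_B): the total scatter matrix should equal the sum of the
# within-class and between-class scatter matrices.
overall_mean = np.mean(X, axis=0)
S_T = (X - overall_mean).T.dot(X - overall_mean)
print('S_T == S_W + S_B:', np.allclose(S_T, S_W + S_B))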
# Solve the eigenvalue problem for S_W^{-1} S_B
def get_eig(S_W, S_B):
    eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    # Print each eigenvalue with its eigenvector
    for i in range(len(eig_vals)):
        eigvec_sc = eig_vecs[:, i].reshape(4, 1)
        print('\nEigenvector {}:\n{}'.format(i + 1, eigvec_sc.real))
        print('Eigenvalue {:}: {:.2e}'.format(i + 1, eig_vals[i].real))
    return eig_vals, eig_vecs
eig_vals, eig_vecs = get_eig(S_W,S_B)
# Pair each eigenvalue with its eigenvector
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# Sort the pairs by eigenvalue, largest first
eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
print('Eigenvalues in decreasing order:\n')
for i in eig_pairs:
    print(i[0])
print('Share of each eigenvalue in the total:\n')
eigv_sum = sum(eig_vals)
for i, j in enumerate(eig_pairs):
    print('Eigenvalue {0:}: {1:.2%}'.format(i + 1, (j[0] / eigv_sum).real))
# Project the data onto the two leading eigenvectors
#**********begin**********#
W = np.hstack((eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))
X_lda = X.dot(W)
#***********end***********#
print(X_lda.shape)
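# Cross-check (a minimal sketch, assuming scikit-learn is available; not part
# of the graded exercise): sklearn's LinearDiscriminantAnalysis should produce
# a 2-D projection spanning the same subspace as X_lda, up to sign and scale.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
X_lda_sk = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)
print(X_lda_sk.shape)  # (150, 2), same shape as X_lda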
Level 3: PCA Projection and Visualization
import matplotlib
matplotlib.use('Agg')
import numpy as np
import warnings
warnings.filterwarnings("ignore")
mean = [0, 0]  # Mean of the generated 2-D data
cov = [[1, 5], [5, 10]]  # Covariance matrix used to generate the data
count = 2000
np.random.seed(20)
data = np.random.multivariate_normal(mean, cov, count)  # Draw samples from a multivariate normal distribution
#plt.scatter(data[:,0], data[:,1], marker='+')  # Scatter plot of the raw data
from numpy.linalg import eig
cov_matrix = np.dot(data.T, data) / (count - 1)  # Covariance matrix of the data (valid since the data has zero mean)
values, vecs = eig(cov_matrix)  # Eigendecomposition: returns the eigenvalues first, then the eigenvectors
# The best projection direction is the eigenvector associated with the largest eigenvalue
from matplotlib import pyplot as plt
#***********Begin**********
# Project the data onto the best projection direction, i.e. the eigenvector of
# the largest eigenvalue (np.linalg.eig does not sort eigenvalues; for this
# data, column 1 holds that eigenvector)
data_new = np.dot(data, vecs[:, 1])
#************End***********
def hist_plot(data_new):
    plt.hist(data_new, bins=80, facecolor='blue')  # Histogram of the projected data
    return plt
hist_plot(data_new).savefig('/data/workspace/myshixun/step3/学员文件/describe.png')
print(data_new[0:3])
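# Robustness check (a minimal sketch, not part of the graded exercise):
# selecting the eigenvector via np.argmax makes the choice independent of the
# column order that eig happens to return.
best = vecs[:, np.argmax(values)]
print(np.dot(data, best)[0:3])  # should match data_new[0:3] up to sign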
Level 4: PCA Projection vs. LDA Projection
import matplotlib
matplotlib.use('Agg')
import numpy as np
from numpy.linalg import eig
from matplotlib import pyplot as plt
Class_data1=[(1,2),(2,3),(3,3),(4,5),(5,5)]
Class_data2=[(1,0),(2,1),(3,1),(3,2),(5,3),(6,5)]
data1_line = np.hstack((np.array(Class_data1)[:,0],np.array(Class_data2)[:,0]))
data2_line = np.hstack((np.array(Class_data1)[:,1],np.array(Class_data2)[:,1]))
def pca():
    c1 = data1_line - data1_line.mean()  # Subtract the mean to center the data
    c2 = data2_line - data2_line.mean()
    data = np.vstack((c1, c2))
    # Covariance matrix of the centered data
    #***********Begin**********
    cov_matrix = np.dot(data, data.T) / ((len(Class_data1) + len(Class_data2)) - 1)
    #************End***********
    values, vecs = eig(cov_matrix)  # Eigendecomposition of the covariance matrix
    global vecs1
    vecs1 = vecs
    print(vecs[:, 0])
pca()
def LDA():
    u1 = np.array(Class_data1).mean(axis=0)  # Mean vector of each class
    u2 = np.array(Class_data2).mean(axis=0)
    # First step of the scatter matrices: each sample's deviation from its class mean
    m = Class_data1 - u1
    n = Class_data2 - u2
    # Within-class scatter of each class
    #***********Begin**********
    S1 = np.dot(m.T, m)
    S2 = np.dot(n.T, n)
    #************End***********
    Sw = S1 + S2
    C = np.array([data1_line.mean(), data2_line.mean()])  # Overall mean of all samples
    b = np.linalg.inv(Sw)
    m1 = (u1 - C).reshape(2, 1)  # Reshape to column vectors for the outer products
    n1 = (u2 - C).reshape(2, 1)
    S3 = np.dot(m1, m1.T)
    S4 = np.dot(n1, n1.T)
    # Between-class scatter: class sizes times the outer products of the mean deviations
    #***********Begin**********
    Sb = len(Class_data1) * S3 + len(Class_data2) * S4
    #************End***********
    values, vecs = eig(np.dot(b, Sb))
    global vecs2
    vecs2 = vecs
    print(vecs[:, 1])
    a = (u1 - u2).T
    print(np.dot(a, b))
LDA()
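# Equivalence check (a minimal sketch, not part of the graded exercise): for
# two classes the LDA direction is proportional to Sw^{-1}(u1 - u2), so its
# normalized form should match vecs2[:, 1] up to sign.
u1 = np.array(Class_data1).mean(axis=0)
u2 = np.array(Class_data2).mean(axis=0)
Sw = np.dot((Class_data1 - u1).T, Class_data1 - u1) + np.dot((Class_data2 - u2).T, Class_data2 - u2)
w_closed = np.linalg.inv(Sw).dot(u1 - u2)
print(w_closed / np.linalg.norm(w_closed))  # compare with the normalized vecs2[:, 1]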
def plot_1(vecs1, vecs2):
    x = np.array(Class_data1)[:, 0]
    y = np.array(Class_data1)[:, 1]
    x1 = np.array(Class_data2)[:, 0]
    y1 = np.array(Class_data2)[:, 1]
    x_ = np.linspace(0, 6)
    y_ = (vecs1[:, 1][1] / vecs1[:, 1][0]) * x_  # Line through the origin along the PCA direction
    x_1 = np.linspace(-2, 2)
    y_1 = (vecs2[:, 1][1] / vecs2[:, 1][0]) * x_1  # Line through the origin along the LDA direction
    plt.plot(x, y, 'ro')
    plt.plot(x_, y_, '-r', label="$PCA$")
    plt.plot(x1, y1, 'bo')
    plt.plot(x_1, y_1, 'blue', label="$LDA$")
    plt.grid()
    plt.legend()
    return plt
plot_1(vecs1, vecs2).savefig('/data/workspace/myshixun/step4/学员文件/describe.png')
Level 5: Comparing Classification Performance and Efficiency With and Without Projection (Breast Cancer Experiment)
from sklearn.svm import SVC  # Support vector machine classifier
from sklearn.datasets import load_breast_cancer  # Built-in breast cancer dataset from sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA  # PCA for dimensionality reduction
from sklearn.preprocessing import StandardScaler  # Standardize the dataset
from sklearn import metrics
def knncls():
    data = load_breast_cancer()
    # The breast cancer dataset has 569 samples, 30 features, and binary labels
    x = data.data  # Sample features to be split
    y = data.target  # Sample labels to be split
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)
    print("Ground-truth test labels:", y_test)
    # Standardize: fit the scaler on the training set, then apply it to the test set
    Std = StandardScaler()
    x_train = Std.fit_transform(x_train)
    x_test = Std.transform(x_test)  # Use transform (not fit_transform) so the test set reuses the training statistics
    model_linear = SVC(C=1.0, kernel="linear")  # Linear SVM; C is the penalty parameter, 'linear' selects the linear kernel
    model_linear.fit(x_train, y_train)  # Train the linear SVM on the training set
    preresult = model_linear.predict(x_test)
    print("SVM predictions without dimensionality reduction:", preresult)
    print('Accuracy without dimensionality reduction: {}'.format(metrics.accuracy_score(y_test, preresult)))
    # Reduce the dimensionality with PCA
    # ***********Begin**********
    pca = PCA(n_components=5)
    # ************End***********
    x_norm = StandardScaler().fit_transform(x)  # Standardize the full dataset before fitting PCA
    x_pca = pca.fit_transform(x_norm)  # Fit PCA and project the standardized data
    print(pca.explained_variance_ratio_)  # Variance ratio of the 5 components; most of the variance sits in the first few
    x_train = pca.fit_transform(x_train)
    x_test = pca.transform(x_test)
    model_linear = SVC(C=1.0, kernel="linear")
    model_linear.fit(x_train, y_train)
    y_predict = model_linear.predict(x_test)
    print("SVM predictions after dimensionality reduction:", y_predict)
    print('Accuracy after dimensionality reduction: {}'.format(metrics.accuracy_score(y_test, y_predict)))
    return None
knncls()
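# Timing sketch (an illustrative addition, not part of the graded exercise):
# the level title also asks about efficiency, so one simple way to compare is
# to time the SVM fit on the raw 30-feature data versus the PCA-reduced data.
import time
x, y = load_breast_cancer(return_X_y=True)
x = StandardScaler().fit_transform(x)
for features in (x, PCA(n_components=5).fit_transform(x)):
    start = time.perf_counter()
    SVC(C=1.0, kernel="linear").fit(features, y)
    print(features.shape[1], 'features:', time.perf_counter() - start, 'seconds')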