本文从机器学习大方向着手,分三篇,从无监督,有监督,强化学习三大部分介绍最简单的概念和知识,涉及部分数学推导和代码。主要参考:北理工礼欣老师的MOOC《Python机器学习应用》,布客飞龙译《写给人类的机器学习》,吴恩达老师机器学习教程中文笔记-在线版
欧氏距离:即两点之间直线相连的距离
曼哈顿距离:沿着坐标轴走直线,将每个维度的差绝对值相加
马氏距离:协方差距离,尺度无关,下面是某知乎用户的回答,“将变量按照主成分进行旋转,让维度间相互独立,然后进行标准化,让维度同分布就OK了”
夹角余弦:a和b的点乘除以两者模长的乘积,只考虑夹角关系,与向量长度无关
if __name__ == '__main__':
    # Single source of truth for the cluster count: it sizes both the model
    # and the per-cluster buckets, so the two can no longer drift apart.
    n_clusters = 4
    # loadData is a hand-written helper (not shown here); its result is
    # unpacked into the feature rows and the matching city names.
    data, cityName = loadData('city.txt')
    # KMeans(n_clusters=number of clusters,
    #        [init='k-means++' seeding method, max_iter=300 iterations])
    km = KMeans(n_clusters=n_clusters)
    # Fit the centres and get, for every row of data, the index of the
    # cluster it was assigned to (used below as label -> bucket).
    label = km.fit_predict(data)
    # axis=1 sums across each centre's coordinates: one total per cluster.
    expenses = numpy.sum(km.cluster_centers_, axis=1)
    CityCluster = [[] for _ in range(n_clusters)]
    for name, cluster_id in zip(cityName, label):
        CityCluster[cluster_id].append(name)
    for total, cities in zip(expenses, CityCluster):
        print("Expenses:%.2f" % total)
        print(cities)
$Ax=\lambda x$,其中A是变换矩阵(可以理解为由要投影过去的坐标基向量组成),x为特征向量,λ就是特征值。为了方便计算,引入单位矩阵E(E除了对角元之外都是0),把 $\lambda x$ 写成 $\lambda E x$,移项后上式变为 $(A-\lambda E)x=0$;x要有非零解,就得到最熟悉的特征方程 $|A-\lambda E|=0$。这是通用的求法,如果还不懂,可以看这个知乎回答,写得太好了。
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
data = load_iris()
y = data.target
X = data.data
# Keep two principal components after the reduction.
pca = PCA(n_components=2)
reduced_X = pca.fit_transform(X)

red_x, red_y = [], []
blue_x, blue_y = [], []
green_x, green_y = [], []
# Label 0 -> red lists, label 1 -> blue lists; any other label falls back
# to the green lists, mirroring the original if / elif / else chain.
buckets = {0: (red_x, red_y), 1: (blue_x, blue_y)}
for point, target in zip(reduced_X, y):
    xs, ys = buckets.get(target, (green_x, green_y))
    xs.append(point[0])
    ys.append(point[1])

# Visualise the data: one scatter series per class.
plt.scatter(red_x, red_y, c='r', marker='x')
plt.scatter(blue_x, blue_y, c='b', marker='D')
plt.scatter(green_x, green_y, c='g', marker='.')
plt.show()
$M=U\Sigma V^{T}$
这个式子就推出来啦。
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
from sklearn import datasets
from skimage import io
def getImgAsMat(index):
    """Return one image of the Olivetti faces dataset as a NumPy matrix.

    Args:
        index: Position of the wanted image within ``ds.images``.

    Returns:
        ``ds.images[index]`` wrapped as a ``numpy.matrix``.
    """
    ds = datasets.fetch_olivetti_faces()
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion
    # and exists in older releases as well.
    return np.asmatrix(ds.images[index])
def getImgAsMatFromFile(filename):
    """Read an image file as grayscale and return it as a NumPy matrix.

    Args:
        filename: Path of the image file handed to ``io.imread``.

    Returns:
        The loaded image wrapped as a ``numpy.matrix``.
    """
    # scikit-image renamed the keyword from "as_grey" to "as_gray"; the old
    # spelling is rejected by current releases.
    img = io.imread(filename, as_gray=True)
    # np.mat was removed in NumPy 2.0; np.asmatrix is the same conversion.
    return np.asmatrix(img)
def plotImg(imgMat):
    """Show ``imgMat`` in a matplotlib window using the gray colormap."""
    plt.imshow(imgMat, cmap="gray")
    plt.show()
def recoverBySVD(imgMat, k):
    """Rebuild an image from its k largest singular values.

    Args:
        imgMat: 2-D image data, either a ``numpy.ndarray`` or a
            ``numpy.matrix``.
        k: Number of leading singular values (and matching singular
            vectors) to keep. Values larger than the number of singular
            values simply keep all of them.

    Returns:
        The rank-k reconstruction, with the same shape as ``imgMat``.
    """
    # Reduced SVD: only the leading singular vectors are ever sliced below,
    # and the reduced factors always have mutually compatible shapes, even
    # when k exceeds min(rows, cols) of a non-square image.
    U, s, Vh = la.svd(imgMat, full_matrices=False)
    # Keep the top-k components; la.svd returns the singular values sorted
    # in descending order.
    Uk = U[:, :k]
    Sk = np.diag(s[:k])
    Vk = Vh[:k, :]
    # "@" is a true matrix product for ndarray and np.matrix alike, whereas
    # "*" multiplies plain ndarrays element-wise.
    imgMat_new = Uk @ Sk @ Vk
    return imgMat_new
# Demo driver: show the source picture, then its low-rank reconstruction.
_IMAGE_PATH = 'D:/pic.jpg'
_KEEP_SINGULAR_VALUES = 30

A = getImgAsMatFromFile(_IMAGE_PATH)
plotImg(A)
# Rebuild the picture from the leading singular values only and show it.
A_new = recoverBySVD(A, _KEEP_SINGULAR_VALUES)
plotImg(A_new)