Continuing from the previous post.
Hierarchical clustering tries to partition the data set at different levels, producing a tree-shaped clustering structure. The partitioning can follow a "bottom-up" agglomerative strategy or a "top-down" divisive strategy. Since the bottom-up strategy is the more widely used one, only it is discussed here.
AGNES (AGglomerative NESting) is a bottom-up hierarchical clustering algorithm. It starts by treating every sample in the data set as its own initial cluster; in each iteration it finds the two closest (most similar) clusters and merges them, and it keeps iterating until the preset number of clusters is reached.
The distance here is different from a point-to-point distance; the distance between clusters $C_i$ and $C_j$ can be defined as:
Minimum distance:
$$d_{min}(C_i,C_j) = \min_{x \in C_i,\, z \in C_j} dist(x,z) \tag{1}$$
Maximum distance:
$$d_{max}(C_i,C_j) = \max_{x \in C_i,\, z \in C_j} dist(x,z) \tag{2}$$
Average distance:
$$d_{avg}(C_i,C_j) = \frac{1}{|C_i||C_j|}\sum_{x \in C_i}\sum_{z \in C_j} dist(x,z) \tag{3}$$
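To make the three definitions concrete, here is a minimal NumPy sketch; the two small clusters and the use of Euclidean distance for dist(x, z) are assumptions for illustration only:

import numpy as np

# Hypothetical example clusters: each row is one 2-D sample point.
Ci = np.array([[0.0, 0.0], [1.0, 0.0]])
Cj = np.array([[4.0, 0.0], [5.0, 1.0]])

# Pairwise Euclidean distances dist(x, z) for all x in Ci, z in Cj.
pairwise = np.linalg.norm(Ci[:, None, :] - Cj[None, :, :], axis=-1)

d_min = pairwise.min()   # Eq. (1): minimum distance
d_max = pairwise.max()   # Eq. (2): maximum distance
d_avg = pairwise.mean()  # Eq. (3): average distance
print(d_min, d_max, d_avg)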
The distance between two sets is usually measured with the Hausdorff distance:
$$h(C_i,C_j) = \max_{x \in C_i}\{\min_{z \in C_j}\{dist(x,z)\}\}$$
A more general definition of the Hausdorff distance is:
$$H(C_i,C_j) = \max\{h(C_i,C_j),\, h(C_j,C_i)\}$$
Pseudocode:
1. h = 0
2. for every point a_i of A:
   2.1 shortest = Inf
   2.2 for every point b_j of B:
           d_ij = dist(a_i, b_j)
           if d_ij < shortest then
               shortest = d_ij
   2.3 if shortest > h then
           h = shortest
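As a quick cross-check, SciPy ships a directed Hausdorff implementation; the sketch below (the point sets are made-up examples) computes $h$ in both directions and the symmetric $H$:

import numpy as np
from scipy.spatial.distance import directed_hausdorff

# Hypothetical point sets; each row is one point.
Ci = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
Cj = np.array([[3.0, 0.0], [3.0, 1.0]])

h_ij = directed_hausdorff(Ci, Cj)[0]  # h(Ci, Cj)
h_ji = directed_hausdorff(Cj, Ci)[0]  # h(Cj, Ci)
H = max(h_ij, h_ji)                   # symmetric Hausdorff distance
print(h_ij, h_ji, H)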
When the inter-cluster distance is taken to be $d_{min}$, $d_{max}$, or $d_{avg}$, the corresponding AGNES algorithm is called single-linkage, complete-linkage, or average-linkage clustering, respectively.
In Single-Link clustering similarity between clusters is measured as the similarity between the most similar pair of elements, one from each of the clusters, while in Complete-Link clustering the similarity is measured using the least similar pair of elements.
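For reference, SciPy's agglomerative clustering exposes exactly these choices through its method argument; a small sketch (the sample data is made up) comparing the linkages:

import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

# Made-up 2-D samples forming two loose groups.
X = np.array([[0, 0], [0, 1], [1, 0],
              [5, 5], [5, 6], [6, 5]], dtype=float)

Z_single = linkage(X, method='single')      # d_min, single-linkage
Z_complete = linkage(X, method='complete')  # d_max, complete-linkage
Z_average = linkage(X, method='average')    # d_avg, average-linkage

# Cut each tree into k = 2 flat clusters.
print(fcluster(Z_single, t=2, criterion='maxclust'))
print(fcluster(Z_complete, t=2, criterion='maxclust'))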
# Bottom-up (agglomerative) hierarchical clustering
import sys

import numpy as np
import tsplib95

# Preset number of clusters
k = 6
# Inter-cluster distance measure  0: max, 1: min, 2: average, 3: hausdorff
dist_func = 3


# Inter-cluster distance dispatcher
def get_cluster_distance(cluster1, cluster2, problem):
    dist = 0
    if dist_func == 0:
        dist = get_distance_max(cluster1, cluster2, problem)
    elif dist_func == 1:
        dist = get_distance_min(cluster1, cluster2, problem)
    elif dist_func == 2:
        dist = get_distance_average(cluster1, cluster2, problem)
    elif dist_func == 3:
        dist = get_distance_hausdorff(cluster1, cluster2, problem)
    else:
        print("dist_func does not match any supported option!")
    return dist


def get_node_distance(node1, node2, problem):
    return problem.get_weight(node1, node2)


# Inter-cluster distance: maximum distance (complete linkage)
def get_distance_max(cluster1, cluster2, problem):
    dist_max = 0
    for node1 in cluster1:
        for node2 in cluster2:
            temp_dist = get_node_distance(node1, node2, problem)
            if temp_dist > dist_max:
                dist_max = temp_dist
    return dist_max


# Inter-cluster distance: minimum distance (single linkage)
def get_distance_min(cluster1, cluster2, problem):
    dist_min = sys.maxsize
    for node1 in cluster1:
        for node2 in cluster2:
            temp_dist = get_node_distance(node1, node2, problem)
            if temp_dist < dist_min:
                dist_min = temp_dist
    return dist_min


# Inter-cluster distance: average distance (average linkage)
def get_distance_average(cluster1, cluster2, problem):
    dist_sum = 0
    for node1 in cluster1:
        for node2 in cluster2:
            dist_sum += get_node_distance(node1, node2, problem)
    return dist_sum / (len(cluster1) * len(cluster2))


# Inter-cluster distance: directed Hausdorff distance
def get_distance_hausdorff(cluster1, cluster2, problem):
    dist_max = 0
    for node1 in cluster1:
        dist_min = sys.maxsize
        for node2 in cluster2:
            temp_dist = get_node_distance(node1, node2, problem)
            if temp_dist < dist_min:
                dist_min = temp_dist
        if dist_min > dist_max:
            dist_max = dist_min
    return dist_max


# Read a TSP instance and initialise the sample set: one singleton cluster per node
def initial_sample_set(file_path):
    problem = tsplib95.load(file_path)
    sample_set = [[i + 1] for i in range(problem.dimension)]
    return problem, sample_set


# Find the pair of closest clusters, O(n^2) over the current clusters
def find_nearest_cluster(sample_set, problem):
    cluster_index1 = -1
    cluster_index2 = -1
    min_dist = sys.maxsize
    for i in range(len(sample_set) - 1):
        for j in range(i + 1, len(sample_set)):
            temp_dist = get_cluster_distance(sample_set[i], sample_set[j], problem)
            if temp_dist < min_dist:
                min_dist = temp_dist
                cluster_index1 = i
                cluster_index2 = j
    return cluster_index1, cluster_index2


# Print the hierarchical clustering result
def print_cluster_result(sample_set):
    print(sample_set)


if __name__ == '__main__':
    file_path = "D:\\dataset\\tsp\\dantzig42.tsp\\dantzig42.tsp"
    problem, sample_set = initial_sample_set(file_path)
    while len(sample_set) > k:
        cluster_index1, cluster_index2 = find_nearest_cluster(sample_set, problem)
        # Remove the two nearest clusters from the sample set
        cluster1 = sample_set[cluster_index1].copy()
        cluster2 = sample_set[cluster_index2].copy()
        del sample_set[cluster_index2]
        del sample_set[cluster_index1]
        # Append the merged cluster back into the sample set
        cluster1.extend(cluster2)
        sample_set.append(cluster1)
    print_cluster_result(sample_set)
    # Next step: exhaustive traversal inside every cluster
random walk (English original)
random walk (Chinese translation)
Random walk quantization
From random walks to Graph Embedding
To define this walk formally, take independent random variables $Z_1, Z_2, \dots$, where each variable is either 1 or −1, with a 50% probability for either value, and set $S_0 = 0$ and $S_n = \sum_{j=1}^{n} Z_j$. The series $\{S_n\}$ is called the simple random walk on $\mathbb{Z}$. This series (the sum of the sequence of −1s and 1s) gives the net distance walked, if each part of the walk is of length one. The expectation $E(S_n)$ of $S_n$ is zero. That is, the mean of all coin flips approaches zero as the number of flips increases. This follows by the finite additivity property of expectation:
$$E(S_n) = \sum_{j=1}^{n} E(Z_j) = 0.$$
A similar calculation, using the independence of the random variables and the fact that $E(Z_n^2) = 1$, shows that:
$$E(S_n^2) = \sum_{i=1}^{n} E(Z_i^2) + 2\sum_{1 \leq i < j \leq n} E(Z_i Z_j) = n.$$
This hints that $E(|S_n|)$, the expected translation distance after n steps, should be of the order of $\sqrt{n}$. In fact,[6]
$$\lim_{n\to\infty} \frac{E(|S_n|)}{\sqrt{n}} = \sqrt{\frac{2}{\pi}}.$$
Takeaway: a random walk, true to its name, has a next state that depends only on the current state and the transition probability matrix, not on the decision made in the previous step (somewhat reminiscent of dynamic programming). The definition above is for the one-dimensional line, but it carries over unchanged to spaces of other dimensions.
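A quick Monte Carlo check of the $\sqrt{2/\pi}$ limit above; the step count and number of walks are arbitrary choices:

import numpy as np

rng = np.random.default_rng(0)
n_steps, n_walks = 1000, 5000

# Each walk is a sum of i.i.d. +/-1 steps; S_n is the final position.
steps = rng.choice([-1, 1], size=(n_walks, n_steps))
S_n = steps.sum(axis=1)

print(np.abs(S_n).mean() / np.sqrt(n_steps))  # should be close to sqrt(2/pi)
print(np.sqrt(2 / np.pi))                     # ~ 0.7979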
For an undirected weighted graph:

Adjacency matrix:
$$\begin{array}{cccc} 0 & 1 & 0 & 1 \\ 1 & 0 & 1 & 1 \\ 0 & 1 & 0 & 0 \\ 1 & 1 & 0 & 0 \end{array}$$

Add self-loops, i.e. set the diagonal to 1:
$$\begin{array}{cccc} 1 & 1 & 0 & 1 \\ 1 & 1 & 1 & 1 \\ 0 & 1 & 1 & 0 \\ 1 & 1 & 0 & 1 \end{array}$$

Compute the degree of each node:
$$\mathrm{d}=(3, 4, 2, 3)$$

Random walk transition probability matrix (each column of the self-loop matrix divided by the corresponding node degree, so every column sums to 1):
$$\begin{matrix} \frac{1}{3} & \frac{1}{4} & 0 & \frac{1}{3}\\ \frac{1}{3} & \frac{1}{4} & \frac{1}{2} & \frac{1}{3}\\ 0 & \frac{1}{4} & \frac{1}{2} & 0 \\ \frac{1}{3} & \frac{1}{4} & 0 & \frac{1}{3} \end{matrix}$$
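The same construction in NumPy; the adjacency matrix is the 4-node example above, and column-normalising by degree matches the convention used by the MCL code later in this post:

import numpy as np

# Adjacency matrix of the example graph, with self-loops on the diagonal.
A = np.array([[1, 1, 0, 1],
              [1, 1, 1, 1],
              [0, 1, 1, 0],
              [1, 1, 0, 1]], dtype=float)

degree = A.sum(axis=0)   # d = (3, 4, 2, 3)
M = A / degree           # column-stochastic transition matrix
print(M)
print(M.sum(axis=0))     # every column sums to 1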
Reference
Let f(x) be a multivariate function of n variables, with x = (x_1, x_2, …, x_n) an n-dimensional vector.
1. Give an initial iteration point x, an initial step length λ, and a control precision ε (ε is a very small positive number used to decide when to stop the algorithm).
2. Give an iteration control count N; let k be the current iteration counter and set k = 1.
3. While k < N, randomly generate an n-dimensional vector u = (u_1, u_2, ⋯, u_n) with each component in (−1, 1), i.e. −1 < u_i < 1 for i = 1, 2, ⋯, n, and normalise it to u' = u / sqrt(Σ_{i=1}^{n} u_i^2). Let x_1 = x + λu', which completes one step of the walk.
4. Evaluate the function: if f(x_1) < f(x), i.e. a point better than the current one has been found, reset k to 1, replace x with x_1, and return to step 2; otherwise set k = k + 1 and return to step 3.
5. If no better value is found in N consecutive attempts, the optimum is assumed to lie inside the n-dimensional ball centred at the current best solution with radius equal to the current step length (for three dimensions this is an ordinary ball in space). At this point, if λ < ε, terminate the algorithm; otherwise set λ = λ/2, return to step 1, and start a new round of walking.
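A minimal sketch of this shrinking-step random walk on a toy objective; the quadratic objective, starting point, and parameter values are arbitrary assumptions for illustration:

import numpy as np

def random_walk_minimize(f, x, step=1.0, eps=1e-6, n_tries=100, seed=0):
    """Shrinking-step random walk minimisation as described above."""
    rng = np.random.default_rng(seed)
    while step > eps:
        k = 1
        while k < n_tries:
            u = rng.uniform(-1.0, 1.0, size=x.shape)
            u /= np.linalg.norm(u)      # normalise the random direction
            x_new = x + step * u        # one step of the walk
            if f(x_new) < f(x):         # better point found: restart the counter
                x = x_new
                k = 1
            else:
                k += 1
        step /= 2                       # no improvement in n_tries steps: halve the step
    return x

# Toy objective with its minimum at (1, 2).
f = lambda x: (x[0] - 1) ** 2 + (x[1] - 2) ** 2
print(random_walk_minimize(f, np.array([5.0, -3.0])))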
Paper: On Clustering Using Random Walks (2001)
Reading notes:
We now offer two methods for performing the edge separation, both based on deterministic analysis of random walks.
Edge separation / sharpening
NS: Separation by neighborhood similarity.
CE: Separation by circular escape.
the weighted neighborhood
bipartite subgraph
Understanding the algorithm:
Let $G(V, E, \omega)$ be a weighted graph, where $V$ is the set of nodes, $E$ is the set of edges between nodes in $V$, and $\omega: E \to \mathbb{R}$ is a function that measures the similarity between pairs of items.
$$p_{ij} = \frac{\omega(i,j)}{d_i}$$
$$d_i = \sum_{k=1}^{n}\omega(i,k)$$
$M^G \in \mathbb{R}^{n \times n}$ is the associated transition matrix,
$$M^G_{ij} = \begin{cases} p_{ij} & \langle i,j \rangle \in E \\ 0 & \textrm{otherwise} \end{cases}$$
This part is rather tricky: I could not find anywhere in the paper how $P^{k}_{\textrm{visit}}(i)$ is actually computed, and I was stuck here for a long time.
The description in the original paper is:
Now, denote by $P^k_{visit}(i) \in \mathbb{R}^n$ the vector whose j-th component is the probability that a random walk originating at i will visit node j in its k-th step. Thus, $P^k_{visit}(i)$ is the i-th row in the matrix $(M^G)^k$, the k'th power of $M^G$.
So now we know how $M^G$ is computed, but what about $(M^G)^k$? The paper only says "the k'th power of $M^G$", which I read as the matrix $M^G$ raised to the k-th power (repeated matrix multiplication).
$P^k_{visit}(i)$ is the i-th row of the matrix $(M^G)^k$:
$$P^k_{visit}(i) = (M^G)^k_i$$
$$(M^G)^k=\{P^k_{visit}(1)^{\mathbf{T}}, P^k_{visit}(2)^{\mathbf{T}}, \dots, P^k_{visit}(n)^{\mathbf{T}}\}$$
Notice: up to this point this is essentially the same as the Markov Clustering algorithm (MCL). MCL keeps iterating until the matrix no longer changes; here, since that is computationally expensive, the authors instead use the sum of the first k powers:
$$P^{\leq k}_{\textrm{visit}}(v) = \sum_{i=1}^{k} P^{i}_{\textrm{visit}}(v)$$
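A small sketch of these quantities; the symmetric weight matrix W is a made-up example, and np.linalg.matrix_power does the repeated multiplication:

import numpy as np

# Hypothetical symmetric similarity weights omega(i, j); 0 means no edge.
W = np.array([[0, 2, 1, 0],
              [2, 0, 2, 1],
              [1, 2, 0, 0],
              [0, 1, 0, 0]], dtype=float)

d = W.sum(axis=1)     # d_i = sum_k omega(i, k)
M = W / d[:, None]    # row-stochastic: M_ij = p_ij = omega(i, j) / d_i

k = 3
P_visit_k = np.linalg.matrix_power(M, k)  # row i is P^k_visit(i)
P_visit_le_k = sum(np.linalg.matrix_power(M, t) for t in range(1, k + 1))
print(P_visit_le_k[0])                    # P^{<=k}_visit of the first node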
$$NS(G) \xlongequal{dfn} G_s(V, E, \omega_s),$$
where $\forall \langle v, u \rangle \in E,\; \omega_s(u, v) = sim^k(P^{\leq k}_{visit}(v),\, P^{\leq k}_{visit}(u))$
$sim^k(x,y)$ is some similarity measure of the vectors $\mathrm{x}$ and $\mathrm{y}$, whose value increases as $\mathrm{x}$ and $\mathrm{y}$ become more similar.
Suitable choices for $sim^k(x,y)$ include:
$$f^k(x,y) \xlongequal{dfn} \exp(2k - \|x - y\|_{L_1}) - 1 \tag{1}$$
$$\|x - y\|_{L_1} = \sum_{i=1}^{n}|x_i-y_i|$$
Another choice is:
$$\cos(x,y)= \frac{(x,y)}{\sqrt{(x,x)}\cdot\sqrt{(y,y)}} \tag{2}$$
where $(\cdot,\cdot)$ denotes the inner product.
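Both measures are easy to write down directly; a sketch (the vector contents are arbitrary, and k must match the one used for $P^{\leq k}_{\textrm{visit}}$):

import numpy as np

def sim_fk(x, y, k):
    """Eq. (1): exp(2k - ||x - y||_1) - 1."""
    return np.exp(2 * k - np.abs(x - y).sum()) - 1

def sim_cos(x, y):
    """Eq. (2): cosine similarity via inner products."""
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

x = np.array([0.2, 0.5, 0.3])
y = np.array([0.1, 0.6, 0.3])
print(sim_fk(x, y, k=3), sim_cos(x, y))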
# Markov Clustering (MCL) on a small adjacency matrix
import numpy as np


def markovCluster(adjacencyMat, dimension, numIter, power=2, inflation=2):
    columnSum = np.sum(adjacencyMat, axis=0)
    probabilityMat = adjacencyMat / columnSum

    # Expand by taking the e^th power of the matrix.
    def _expand(probabilityMat, power):
        expandMat = probabilityMat
        for i in range(power - 1):
            expandMat = np.dot(expandMat, probabilityMat)
        return expandMat

    expandMat = _expand(probabilityMat, power)

    # Inflate by taking the element-wise power of the resulting
    # matrix with parameter inflation, then re-normalising the columns.
    def _inflate(expandMat, inflation):
        powerMat = expandMat
        for i in range(inflation - 1):
            powerMat = powerMat * expandMat
        inflateColumnSum = np.sum(powerMat, axis=0)
        inflateMat = powerMat / inflateColumnSum
        return inflateMat

    inflateMat = _inflate(expandMat, inflation)

    # Alternate expansion and inflation for a fixed number of iterations.
    for i in range(numIter):
        expand = _expand(inflateMat, power)
        inflateMat = _inflate(expand, inflation)
    print(inflateMat)
    print(np.zeros((dimension, dimension)) != inflateMat)


if __name__ == "__main__":
    dimension = 4
    numIter = 10
    adjacencyMat = np.array([[1, 1, 1, 1],
                             [1, 1, 0, 1],
                             [1, 0, 1, 0],
                             [1, 1, 0, 1]])
    # adjacencyMat = np.array([[1, 1, 1, 1, 0, 0, 0],
    #                          [1, 1, 1, 1, 1, 0, 0],
    #                          [1, 1, 1, 1, 0, 0, 0],
    #                          [1, 1, 1, 1, 0, 0, 0],
    #                          [0, 1, 0, 0, 1, 1, 1],
    #                          [0, 0, 0, 0, 1, 1, 1],
    #                          [0, 0, 0, 0, 1, 1, 1],
    #                          ])
    markovCluster(adjacencyMat, dimension, numIter)
[[1.00000000e+000 1.00000000e+000 1.00000000e+000 1.00000000e+000]
[5.23869755e-218 5.23869755e-218 5.23869755e-218 5.23869755e-218]
[0.00000000e+000 0.00000000e+000 0.00000000e+000 0.00000000e+000]
[5.23869755e-218 5.23869755e-218 5.23869755e-218 5.23869755e-218]]
[[ True True True True]
[ True True True True]
[False False False False]
[ True True True True]]
From this output we can read off the clustering result $\{\{1,2,4\},\{3\}\}$.
Spectral clustering
MCL
MCL GitHub