1 根据数据构造一个graph, graph的每个节点对应一个数据点, 边的权重表示数据之间的相似度, 0表示完全不相似(通常为减少计算量, 将相似度低于阀值r的都记为0)。 graph用邻接矩阵的形式表示出来, 记为W.
2 把W的每一列元素加起来得到N个数,把他们放在对角上形成一个N * N矩阵, 记为D, D的其他元素都是0。 并令 L = D - W
3 求出L的前k个特征值及其对应的特征列向量
4 将这k个特征列向量排列在一起组成一个N * k的矩阵, 将每一行看做是一个k维空间中的向量, 然后对这N个k维向量进行聚类。
聚类的结果就是最终graph中的点的类别。
cut(A, B) = sigma(i in A, j in B) wij
RatioCut(A, B) = cut(A, B)/|A| + cut(A, B)/|B|
NCut(A, B) = cut(A, B)/vol(A) + cut(A,B)/vol(B), NCut 是normalized cut, vol(A)表示A中的边之和
两者都可以算作 A 的“大小”的一种度量,通过在分母上放置这样的项,就可以有效地防止孤立点的情况出现,达到相对平均一些的分割。
public class SpectralClusterer{ /** * The points of the dataset */ protected DoubleMatrix2D v; /** * The class number of each point in the dataset */ protected int[] cluster; /** * The number of different clusters found */ protected int numOfClusters = 0; /** * The alpha star parameter value */ protected double alpha_star = 0.5; /** * The distance cut factor */ protected double r = -1; /** * The sigma scaling factor */ protected double sigma = 1.0; /** * The using sparse matrix flag */ protected boolean useSparseMatrix = false; //计算x, y之间的欧式距离 protected static double distnorm2(DoubleMatrix1D x, DoubleMatrix1D y) { DoubleMatrix1D z = x.copy(); z.assign(y, Functions.minus); return z.zDotProduct(z); } //合并两个无交集的点集合。 点用整数表示 protected static int[] merge(int[] a, int[] b) { int[] v = new int[a.length + b.length]; System.arraycopy(a, 0, v, 0, a.length); System.arraycopy(b, 0, v, a.length, b.length); return v; } ////note, asso 与 cut 正好相对 //a,b表示两个点集,W是点的相似度矩阵; asso返回Cut(a, b). //Computes the association degree between two partitions of a graph protected static double asso(DoubleMatrix2D W, int[] a, int[] b) { return W.viewSelection(a, b).zSum(); } //同上, 返回NCut(a, b) protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b) { int[] v = merge(a, b); return Nasso(W, a, b, v); } //相对于指定subgraph v的NCut(a, b) ...? protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b, int[] v) { return asso(W, a, a) / asso(W, a, v) + asso(W, b, b) / asso(W, b, v); } //Returns the normalized dissimilarity degree (or cut) between two partitions protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b) { return 2 - Nasso(W, a, b); } //Returns the normalized dissimilarity degree (or cut) between two partitions protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b, int[] v) { return 2 - Nasso(W, a, b, v); } //计算 W的bestCut (最小); 得到两个partition, 分别表示两个点集(cluster) protected static int[][] bestCut(DoubleMatrix2D W) { int n = W.columns(); // Builds the diagonal matrices D and D^(-1/2) (represented as their diagonals) DoubleMatrix1D d = DoubleFactory1D.dense.make(n); DoubleMatrix1D d_minus_1_2 = DoubleFactory1D.dense.make(n); for(int i = 0; i < n; i++) { double d_i = W.viewRow(i).zSum(); d.set(i, d_i); //d就是步骤2中的D,w中各列加起来形成的对角矩阵 d_minus_1_2.set(i, 1 / Math.sqrt(d_i));//d_minus_1_2 是什么? } DoubleMatrix2D D = DoubleFactory2D.sparse.diagonal(d);//这里的D是什么 DoubleMatrix2D X = D.copy(); // X = D^(-1/2) * (D - W) * D^(-1/2),,,, X就是步骤2中的L。。? X.assign(W, Functions.minus); for(int i = 0; i < n; i++) for(int j = 0; j < n; j++) X.set(i, j, X.get(i, j) * d_minus_1_2.get(i) * d_minus_1_2.get(j)); // Computes the eigenvalues and the eigenvectors of X //计算特征值与特征向量 EigenvalueDecomposition e = new EigenvalueDecomposition(X); DoubleMatrix1D lambda = e.getRealEigenvalues(); // Selects the eigenvector z_2 associated with the second smallest eigenvalue // Creates a map that contains the pairs <index, eigevalue> //得到第2小的特征值 AbstractIntDoubleMap map = new OpenIntDoubleHashMap(n); for(int i = 0; i < n; i++) map.put(i, Math.abs(lambda.get(i))); IntArrayList list = new IntArrayList(); // Sorts the map on the value map.keysSortedByValue(list); // Gets the index of the second smallest element int i_2 = list.get(1); // y_2 = D^(-1/2) * z_2 DoubleMatrix1D y_2 = e.getV().viewColumn(i_2).copy(); y_2.assign(d_minus_1_2, Functions.mult); // Creates a map that contains the pairs <i, y_2[i]> map.clear(); for(int i = 0; i < n; i++) map.put(i, y_2.get(i)); // Sorts the map on the value map.keysSortedByValue(list); // Search the element in the map previuosly ordered that minimizes the cut // of the partition double best_cut = Double.POSITIVE_INFINITY; int[][] partition = new int[2][]; // The array v contains all the elements of the graph ordered by their // projection on vector y_2 int[] v = list.elements(); // For each admissible splitting point i for(int i = 1; i < n; i++) { // The array a contains all the elements that have a projection on vector // y_2 less or equal to the one of i-th element // The array b contains the remaining elements int[] a = new int[i]; int[] b = new int[n - i]; System.arraycopy(v, 0, a, 0, i); System.arraycopy(v, i, b, 0, n - i); double cut = Ncut(W, a, b, v); if(cut < best_cut) { best_cut = cut; partition[0] = a; partition[1] = b; } } return partition; } /** * Splits recursively the points of the graph while the value of the * best cut found is less of a specified limit (the alpha star factor). * * @param W the weight matrix of the graph * @param alpha_star the alpha star factor * @return an array of sets of points (partitions) */ //以alpha_star作为阀值,在W上递归的进行partition protected static int[][] partition(DoubleMatrix2D W, double alpha_star) { // If the graph contains only one point if(W.columns() == 1) { int[][] p = new int[1][1]; p[0][0] = 0; return p; // Otherwise } else { // Computes the best cut int[][] cut = bestCut(W); // Computes the value of the found cut double cutVal = Ncut(W, cut[0], cut[1], null); // If the value is less than alpha star if(cutVal < alpha_star) { // Recursively partitions the first one found ... DoubleMatrix2D W0 = W.viewSelection(cut[0], cut[0]); int[][] p0 = partition(W0, alpha_star); // ... and the second one DoubleMatrix2D W1 = W.viewSelection(cut[1], cut[1]); int[][] p1 = partition(W1, alpha_star); // Merges the partitions found in the previous recursive steps int[][] p = new int[p0.length + p1.length][]; for(int i = 0; i < p0.length; i++) { p[i] = new int[p0[i].length]; for(int j = 0; j < p0[i].length; j++) p[i][j] = cut[0][p0[i][j]]; } for(int i = 0; i < p1.length; i++) { p[i + p0.length] = new int[p1[i].length]; for(int j = 0; j < p1[i].length; j++) p[i + p0.length][j] = cut[1][p1[i][j]]; } return p; //得到所有合适的partition (得到了所有合适的聚类) } else { // Otherwise returns the partitions found in current step // w/o recursive invocation int[][] p = new int[1][W.columns()]; for(int i = 0; i < p[0].length; i++) p[0][i] = i; return p; } } } public int numberOfClusters() throws java.lang.Exception { return numOfClusters; } //分类。 前提是已经聚类; 返回距离instance最近的类 public int clusterInstance(Instance instance) throws java.lang.Exception { DoubleMatrix1D u = DoubleFactory1D.dense.make(instance.toDoubleArray()); double min_dist = Double.POSITIVE_INFINITY; int c = -1; for(int i = 0; i < v.rows(); i++) { double dist = distnorm2(u, v.viewRow(i)); if(dist < min_dist) { c = cluster[i]; min_dist = dist; } } return c; } //聚类 public void buildClusterer(Instances data) throws java.lang.Exception { int n = data.numInstances(); int k = data.numAttributes(); DoubleMatrix2D w; if(useSparseMatrix) w = DoubleFactory2D.sparse.make(n, n); else w = DoubleFactory2D.dense.make(n, n); double[][] v1 = new double[n][]; for(int i = 0; i < n; i++) v1[i] = data.instance(i).toDoubleArray(); v = DoubleFactory2D.dense.make(v1); double sigma_sq = sigma * sigma; //Sets up similarity matrix。。。 得到矩阵W for(int i = 0; i < n; i++) for(int j = i; j < n; j++) { double dist = distnorm2(v.viewRow(i), v.viewRow(j)); if((r == -1) || (dist < r)) { double sim = Math.exp(- (dist * dist) / (2 * sigma_sq)); w.set(i, j, sim); w.set(j, i, sim); } } //Partitions points。。。。。。。。进行partition int[][] p = partition(w, alpha_star); //Deploys results。。。。。。。。。输出聚类结果 numOfClusters = p.length; cluster = new int[n]; for(int i = 0; i < p.length; i++) for(int j = 0; j < p[i].length; j++) cluster[p[i][j]] = i; } }
partition调用bestCut对W进行划分,直到cut > alpha_cut为止。
bestCut的计算使用了本文最开头的步骤(有一些不一样。。), W, D, L=D-W,计算特征值和特征向量。。
还需要理解。。