Kmeans && Kmeans++ && Davies-Bouldin && Dunn index
1 import java.io.BufferedReader; 2 import java.io.FileReader; 3 import java.io.FileWriter; 4 import java.io.ObjectInputStream.GetField; 5 import java.util.ArrayList; 6 import java.util.Random; 7 8 9 public class Kmeans { 10 11 private int k; 12 private Vector[] cluster_centers; 13 private int[] point_ids; 14 15 private int num_clusters;//used to adapt to different initialization 16 17 Kmeans(int k){ 18 this.k=k; 19 cluster_centers=new Vector[k]; 20 } 21 22 private Vector get_cluster_center(int i){ 23 return cluster_centers[i]; 24 } 25 26 private int get_point_id(int i){ 27 return point_ids[i]; 28 } 29 30 /** 31 * returns the index of the cluster center the closest to the point X 32 */ 33 private int get_closest_center_id(Vector X){ 34 int id=0; 35 double tmp,distance=distance(X,get_cluster_center(0)); 36 for(int i=1;i<k&&i<num_clusters;i++){ 37 tmp=distance(X,get_cluster_center(i)); 38 if(distance>tmp){ 39 distance=tmp; 40 id=i; 41 } 42 } 43 return id; 44 } 45 46 47 /** 48 * add the copy construction in class Vector and KahanSum 49 * */ 50 private double distance(final Vector A, final Vector B){ 51 Vector diff=new Vector(A); 52 diff.sub(B); 53 return diff.norm(); 54 } 55 56 /** 57 * a simple random initialization, used in Kmeans 58 */ 59 private void init_point_indexes(Random rnd){ 60 for(int i=0;i<point_ids.length;i++) 61 point_ids[i] = rnd.nextInt(k); 62 num_clusters=k; 63 } 64 /** 65 * Kmeans++ initialization 66 */ 67 private void init_point_KMplusplus(Random rd,ArrayList<Vector> points){ 68 int cluster0=rd.nextInt(points.size());//choose the first center 69 cluster_centers[0]=points.get(cluster0); 70 //points.remove(cluster0); 71 num_clusters=1; 72 73 double[] D=new double[points.size()]; 74 75 while(num_clusters < k){ 76 int sum=0; 77 for(int i=0;i<points.size();i++){ 78 double d=distance(points.get(i),cluster_centers[ get_closest_center_id(points.get(i)) ]); 79 sum+=d*d; 80 D[i]=sum; 81 } 82 83 double r = rd.nextDouble()*sum; 84 for (int i = 0 ; i < D.length; i++) { 85 if (D[i] >= r){ 86 cluster_centers[num_clusters]=points.get(i); 87 //points.remove(i); 88 num_clusters++; 89 break; 90 } 91 } 92 93 } 94 95 assignment(points); 96 97 } 98 /********************************************************************** 99 * choose init_point_indexs(new Random) to do the Kmeans initialization 100 * or 101 * choose init_point_KMplusplus(new Random(),points) 102 * to do the Kmeans++ initialization 103 * 104 * ******************************************************************/ 105 int clusterize(ArrayList<Vector> points){ 106 int iterations=1; 107 point_ids = new int[points.size()]; 108 109 //init_point_indexes(new Random()); 110 init_point_KMplusplus(new Random(),points); 111 112 update(points); 113 114 while(!assignment(points)){ 115 update(points); 116 iterations++; 117 //if(iterations>100)return iterations; 118 } 119 return iterations; 120 } 121 122 //update the center of different clusters 123 private void update(ArrayList<Vector> points){ 124 ArrayList<Vector> cluster_members=new ArrayList<Vector>(); 125 126 //calculate the centers 127 for(int i=0;i<k;i++){ 128 for(int j=0;j<point_ids.length;j++){ 129 if(get_point_id(j)==i) 130 cluster_members.add(points.get(j)); 131 } 132 133 //System.out.println("!!!!!!"+cluster_members.size()); 134 if(cluster_members.size()!=0) 135 cluster_centers[i]=Vector.vector_median(cluster_members); 136 cluster_members.clear(); 137 } 138 } 139 140 private boolean assignment(ArrayList<Vector> points){ 141 boolean convergence=true; 142 143 for(int i=0;i<points.size();i++){ 144 int closest_id=get_closest_center_id(points.get(i)); 145 if(point_ids[i]!=closest_id){ 146 point_ids[i]=closest_id; 147 convergence=false; 148 } 149 } 150 return convergence; 151 } 152 153 public void write_data_withID(String filename,ArrayList<Vector> list){ 154 FileWriter fw; 155 int i=0,j=0; 156 Vector v; 157 try 158 { 159 fw = new FileWriter(filename); 160 while(j<list.size()){ 161 i=0; 162 v=list.get(j++); 163 while(i<v.get_length()){ 164 fw.write(Double.toString(v.get(i++))+" "); 165 } 166 fw.write(Integer.toString(get_point_id(j-1))); 167 fw.write('\n'); 168 169 } 170 fw.flush(); 171 fw.close(); 172 System.out.println("Vector write with cluster_id finished"); 173 174 }catch(Exception e){ 175 e.printStackTrace(); 176 } 177 } 178 179 /************************ 180 * cluster analysis: 181 * http://en.wikipedia.org/wiki/Cluster_analysis 182 * higher is better 183 * Dunn need the points to be clustered first 184 * so load the List of points and do clusterize() function 185 * **********************/ 186 public double Dunn(ArrayList<Vector> points){ 187 188 clusterize(points); 189 double max_intra_distance=0; 190 double min_cluster_distance=Double.MAX_VALUE; 191 double temp=0; 192 double temp_intra_distance=0; 193 for(int i=0;i<k;i++){ 194 195 temp_intra_distance = max_intra_distance(i,points); 196 if(temp_intra_distance > max_intra_distance) 197 max_intra_distance = temp_intra_distance; 198 199 for(int j=i+1;j<k;j++){ 200 temp=distance(cluster_centers[i], cluster_centers[j]); 201 if(temp<min_cluster_distance) 202 min_cluster_distance=temp; 203 } 204 } 205 if(min_cluster_distance==Double.MAX_VALUE||max_intra_distance==0) 206 { 207 System.out.println("Only have one cluster or Max intra cluster distance is 0" + 208 "\nthe return value will be '0'."); 209 return 0; 210 } 211 212 return min_cluster_distance/max_intra_distance; 213 214 } 215 /** 216 *calculate the average distance of points of cluster i, 217 *do clusterize to cluster the points first, 218 *the lower the better 219 */ 220 public double Davies_Bouldin(ArrayList<Vector> points){ 221 clusterize(points); 222 double[] Average=AverageDistance(points);//average distance of points of cluster i 223 224 double maxValue=0; 225 KahanSum sum=new KahanSum(); 226 227 for(int i=0;i<k;i++){ 228 for(int j=i+1;j<k;j++){ 229 double temp=Average[i]+Average[j]; 230 temp/=distance(cluster_centers[i],cluster_centers[j]); 231 if(temp>maxValue) 232 maxValue=temp; 233 } 234 sum.add(maxValue); 235 } 236 if(k==1)System.out.println("cluster number is 1, the value will be 0"); 237 238 return sum.getsum()/k; 239 240 } 241 /** 242 * the function max_intra_distance get the max intra distance in cluster i 243 * */ 244 private double max_intra_distance(int i,ArrayList<Vector> points){ 245 double dis=0; 246 double max_inrta=0; 247 for(int j=0;j<points.size();j++){ 248 if(point_ids[j]==i){ 249 dis=distance(cluster_centers[i],points.get(j)); 250 if(dis>max_inrta) 251 max_inrta=dis; 252 } 253 } 254 255 return max_inrta; 256 } 257 /** 258 * return a double[], element i has the value of average distance 259 * of the points of cluster i 260 */ 261 private double[] AverageDistance( ArrayList<Vector> points){ 262 263 double[] average =new double[k]; 264 KahanSum distance=new KahanSum(); 265 int count=0; 266 267 for(int i=0;i<k;i++){ 268 for(int j=0;j<point_ids.length;j++){ 269 if(get_point_id(j)==i) 270 { 271 count++; 272 distance.add(distance(cluster_centers[i],points.get(j))); 273 } 274 } 275 276 average[i]=0; 277 if(count!=0) 278 average[i]=distance.getsum()/count; 279 distance.reset(); 280 count=0; 281 } 282 return average; 283 284 } 285 286 public static void main(String[] args) { 287 // TODO Auto-generated method stub 288 ArrayList<Vector> points; 289 if(args.length==0) 290 points=Vector.read_data("dataset-4");//the dataset 291 else 292 points=Vector.read_data(args[0]); 293 //points.get(0).printvec(); 294 295 int dim=points.get(0).get_length(); 296 //System.out.println(points.size()+" "+dim); 297 298 Kmeans km=new Kmeans(2); 299 System.out.println("the iterations is: "+km.clusterize(points)+"\n" + 300 " by using the initialization of kmeans++."); 301 if(args.length==2) 302 km.write_data_withID(args[1], points); 303 else 304 km.write_data_withID("out-datasets", points); 305 306 307 for(int i=1;i<6;i++){ 308 km=new Kmeans(i); 309 System.out.println("Dunn cluster_num ="+i+" "+km.Dunn(points)); 310 } 311 312 for(int i=1;i<6;i++){ 313 km=new Kmeans(i); 314 System.out.println("Davies_Bouldin cluster_num ="+i+" "+km.Davies_Bouldin(points)); 315 } 316 317 318 } 319 320 }