K-means, K-means++, Davies-Bouldin index and Dunn index

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.ObjectInputStream.GetField;
import java.util.ArrayList;
import java.util.Random;

  7 

  8 

  9 public class Kmeans {

 10 

 11     private int k;

 12     private Vector[] cluster_centers;

 13     private int[] point_ids;

 14     

 15     private int num_clusters;//used to adapt to different initialization 

 16     

 17     Kmeans(int k){

 18         this.k=k;

 19         cluster_centers=new Vector[k];

 20     }

 21     

 22     private Vector get_cluster_center(int i){

 23         return cluster_centers[i];

 24     }

 25     

 26     private int get_point_id(int i){

 27         return point_ids[i];    

 28     }

 29 

 30     /**

 31      * returns the index of the cluster center the closest to the point X

 32      */

 33     private int get_closest_center_id(Vector X){    

 34         int id=0;

 35         double tmp,distance=distance(X,get_cluster_center(0));

 36         for(int i=1;i<k&&i<num_clusters;i++){

 37             tmp=distance(X,get_cluster_center(i));

 38             if(distance>tmp){

 39                 distance=tmp;

 40                 id=i;

 41             }

 42         }

 43         return id;    

 44     }

 45     

 46     

 47     /**

 48      * add the copy construction in class Vector and KahanSum

 49      * */

 50     private double distance(final Vector A, final Vector B){

 51         Vector diff=new Vector(A);

 52         diff.sub(B);

 53         return diff.norm();

 54     }

 55     

 56     /**

 57      * a simple random initialization, used in Kmeans

 58      */

 59     private void init_point_indexes(Random rnd){

 60         for(int i=0;i<point_ids.length;i++)

 61             point_ids[i] = rnd.nextInt(k);    

 62         num_clusters=k;

 63     }

 64     /**

 65      * Kmeans++ initialization

 66      */

 67     private void init_point_KMplusplus(Random rd,ArrayList<Vector> points){

 68         int cluster0=rd.nextInt(points.size());//choose the first center

 69         cluster_centers[0]=points.get(cluster0);

 70         //points.remove(cluster0);

 71         num_clusters=1;

 72         

 73         double[] D=new double[points.size()];

 74         

 75         while(num_clusters < k){

 76             int sum=0;

 77             for(int i=0;i<points.size();i++){    

 78                 double d=distance(points.get(i),cluster_centers[ get_closest_center_id(points.get(i)) ]);

 79                 sum+=d*d;

 80                 D[i]=sum;    

 81             }

 82             

 83              double r = rd.nextDouble()*sum;

 84              for (int i = 0 ; i < D.length; i++) {

 85                     if (D[i] >= r){

 86                         cluster_centers[num_clusters]=points.get(i);

 87                         //points.remove(i);

 88                         num_clusters++;

 89                         break;

 90                     }

 91              }

 92             

 93         }

 94         

 95         assignment(points);

 96         

 97     }

 98 /**********************************************************************

 99  * choose init_point_indexs(new Random) to do the Kmeans initialization

100  * or

101  * choose init_point_KMplusplus(new Random(),points) 

102  *     to do the Kmeans++ initialization

103  * 

104  * ******************************************************************/    

105     int clusterize(ArrayList<Vector> points){

106         int iterations=1;

107         point_ids = new int[points.size()];        

108 

109         //init_point_indexes(new Random());

110         init_point_KMplusplus(new Random(),points);

111         

112         update(points);    

113 

114         while(!assignment(points)){

115         update(points);

116         iterations++;

117         //if(iterations>100)return iterations;

118         }

119         return iterations;

120     }

121 

122     //update the center of different clusters

123     private void update(ArrayList<Vector> points){

124         ArrayList<Vector> cluster_members=new ArrayList<Vector>();

125         

126         //calculate the centers

127         for(int i=0;i<k;i++){

128             for(int j=0;j<point_ids.length;j++){

129                 if(get_point_id(j)==i)

130                     cluster_members.add(points.get(j));

131             }

132             

133             //System.out.println("!!!!!!"+cluster_members.size());

134             if(cluster_members.size()!=0)

135             cluster_centers[i]=Vector.vector_median(cluster_members);    

136             cluster_members.clear();

137         }

138     }

139     

140     private boolean assignment(ArrayList<Vector> points){

141         boolean convergence=true;

142         

143         for(int i=0;i<points.size();i++){

144             int closest_id=get_closest_center_id(points.get(i));

145             if(point_ids[i]!=closest_id){

146                 point_ids[i]=closest_id;

147                 convergence=false;

148             }

149         }

150         return convergence;    

151     }

152     

153     public void write_data_withID(String filename,ArrayList<Vector> list){

154         FileWriter fw;

155         int i=0,j=0;

156         Vector v;

157         try

158         {

159             fw = new FileWriter(filename);

160             while(j<list.size()){

161                 i=0;

162             v=list.get(j++);

163             while(i<v.get_length()){

164             fw.write(Double.toString(v.get(i++))+" ");

165             }

166             fw.write(Integer.toString(get_point_id(j-1)));

167             fw.write('\n');

168             

169             }

170             fw.flush();

171             fw.close();

172             System.out.println("Vector write with cluster_id finished");

173 

174         }catch(Exception e){

175             e.printStackTrace();

176         }    

177     }

178 

179     /************************

180      * cluster analysis:

181      * http://en.wikipedia.org/wiki/Cluster_analysis

182      * higher is better

183      * Dunn need the points to be clustered first 

184      * so load the List of points and do clusterize() function

185      * **********************/ 

186     public double Dunn(ArrayList<Vector> points){

187         

188         clusterize(points);

189         double max_intra_distance=0;

190         double min_cluster_distance=Double.MAX_VALUE;

191         double temp=0;

192         double temp_intra_distance=0;

193         for(int i=0;i<k;i++){

194             

195             temp_intra_distance = max_intra_distance(i,points);

196             if(temp_intra_distance > max_intra_distance)

197                 max_intra_distance = temp_intra_distance;

198             

199             for(int j=i+1;j<k;j++){

200                 temp=distance(cluster_centers[i], cluster_centers[j]);

201                 if(temp<min_cluster_distance)

202                     min_cluster_distance=temp;

203             }

204         }

205         if(min_cluster_distance==Double.MAX_VALUE||max_intra_distance==0)

206         {

207             System.out.println("Only have one cluster or Max intra cluster distance is 0" +

208                     "\nthe return value will be '0'.");

209             return 0;

210         }

211         

212         return min_cluster_distance/max_intra_distance;

213         

214     }

215     /**

216      *calculate the average distance of points of cluster i,  

217      *do clusterize to cluster the points first,  

218      *the lower the better

219      */

220     public double Davies_Bouldin(ArrayList<Vector> points){

221         clusterize(points);

222         double[] Average=AverageDistance(points);//average distance of points of cluster i

223         

224         double maxValue=0;

225         KahanSum sum=new KahanSum();

226         

227         for(int i=0;i<k;i++){

228             for(int j=i+1;j<k;j++){

229                 double temp=Average[i]+Average[j];

230                 temp/=distance(cluster_centers[i],cluster_centers[j]);

231                 if(temp>maxValue)

232                     maxValue=temp;

233             }

234             sum.add(maxValue);    

235         }

236         if(k==1)System.out.println("cluster number is 1, the value will be 0");

237         

238         return sum.getsum()/k;

239         

240     }

241     /**

242      * the function max_intra_distance get the max intra distance in cluster i

243      * */

244     private double max_intra_distance(int i,ArrayList<Vector> points){

245         double dis=0;

246         double max_inrta=0;

247         for(int j=0;j<points.size();j++){

248             if(point_ids[j]==i){

249                 dis=distance(cluster_centers[i],points.get(j));

250             if(dis>max_inrta)

251                 max_inrta=dis;

252             }

253         }

254 

255         return max_inrta;

256     }

257     /**

258      * return a double[], element i has the value of average distance

259      * of the points of cluster i

260      */

261     private double[] AverageDistance( ArrayList<Vector> points){

262         

263         double[] average =new double[k];

264         KahanSum distance=new KahanSum();

265         int count=0;

266         

267         for(int i=0;i<k;i++){

268             for(int j=0;j<point_ids.length;j++){

269                 if(get_point_id(j)==i)

270                     {

271                     count++;

272                     distance.add(distance(cluster_centers[i],points.get(j)));

273                     }

274             }

275             

276             average[i]=0;

277             if(count!=0)

278                 average[i]=distance.getsum()/count;    

279             distance.reset();

280             count=0;

281         }

282         return average;

283 

284     }

285     

286     public static void main(String[] args) {

287         // TODO Auto-generated method stub

288         ArrayList<Vector> points;

289         if(args.length==0)

290             points=Vector.read_data("dataset-4");//the dataset

291         else

292             points=Vector.read_data(args[0]);

293         //points.get(0).printvec();

294         

295         int dim=points.get(0).get_length();

296         //System.out.println(points.size()+" "+dim);

297         

298         Kmeans km=new Kmeans(2);

299         System.out.println("the iterations is: "+km.clusterize(points)+"\n" +

300                 "    by using the initialization of kmeans++.");

301         if(args.length==2)

302         km.write_data_withID(args[1], points);

303         else

304         km.write_data_withID("out-datasets", points);

305 

306         

307         for(int i=1;i<6;i++){

308             km=new Kmeans(i);

309             System.out.println("Dunn cluster_num ="+i+" "+km.Dunn(points));

310         }

311         

312         for(int i=1;i<6;i++){

313             km=new Kmeans(i);

314             System.out.println("Davies_Bouldin cluster_num ="+i+" "+km.Davies_Bouldin(points));

315         }

316         

317     

318     }

319 

320 }

 

你可能感兴趣的:(index)