Text Clustering with K-means

The previous two posts classified the newsgroup texts with the Naive Bayes and KNN algorithms; this post clusters the same texts with the K-means algorithm.


1. Text Preprocessing

Text preprocessing was already covered in the previous two posts, so it is omitted here.


2. Text Vectorization

package com.datamine.kmeans;

import java.io.*;
import java.util.*;
import java.util.Map.Entry;

/**
 * Computes the TF-IDF feature vector of every document, i.e. vectorizes the corpus
 * @author Administrator
 */
public class ComputeWordsVector {

	/**
	 * Compute the TF-IDF feature vectors of all documents,
	 * returned as Map<file name, <term, TF-IDF value>>
	 * @param testSampleDir directory of the preprocessed clustering test samples
	 * @return map holding the feature vectors of all test samples
	 * @throws IOException
	 */
	public Map<String, Map<String, Double>> computeTFMultiIDF(String testSampleDir) throws IOException{
		
		String word;
		Map<String, Map<String, Double>> allTestSampleMap = new TreeMap<String, Map<String, Double>>();
		Map<String, Double> idfPerWordMap = computeIDF(testSampleDir);
		Map<String, Double> tfPerDocMap = new TreeMap<String, Double>();
		
		File[] samples = new File(testSampleDir).listFiles();
		System.out.println("the total number of test files is " + samples.length);
		for(int i = 0; i < samples.length; i++){
			
			tfPerDocMap.clear();
			FileReader samReader = new FileReader(samples[i]);
			BufferedReader samBR = new BufferedReader(samReader);
			double wordSumPerDoc = 0; // total number of words in this document
			Double wordWeight;
			
			// Count the raw term frequency of every word in the document
			while((word = samBR.readLine()) != null){
				if(!word.isEmpty()){
					wordSumPerDoc++;
					if(tfPerDocMap.containsKey(word))
						tfPerDocMap.put(word, tfPerDocMap.get(word) + 1);
					else
						tfPerDocMap.put(word, 1.0);
				}
			}
			samBR.close();
			
			// Highest term frequency in this document (computed here but not used
			// by the normalization below, which divides by the document length)
			Double maxCount = 0.0;
			Set<Map.Entry<String, Double>> tempTF = tfPerDocMap.entrySet();
			for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();){
				Map.Entry<String, Double> me = mt.next();
				if(me.getValue() > maxCount)
					maxCount = me.getValue();
			}
			
			// Weight every term: (tf / document length) * log(N / df)
			for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();){
				Map.Entry<String, Double> me = mt.next();
				Double IDF = Math.log(samples.length / idfPerWordMap.get(me.getKey()));
				wordWeight = (me.getValue() / wordSumPerDoc) * IDF;
				tfPerDocMap.put(me.getKey(), wordWeight);
			}
			TreeMap<String, Double> tempMap = new TreeMap<String, Double>();
			tempMap.putAll(tfPerDocMap);
			allTestSampleMap.put(samples[i].getName(), tempMap);
		}
		printTestSampleMap(allTestSampleMap);
		return allTestSampleMap;
	}
	
	/**
	 * Dump the test sample map to a file, for debugging
	 * @param allTestSampleMap
	 * @throws IOException 
	 */
	private void printTestSampleMap(
			Map<String, Map<String, Double>> allTestSampleMap) throws IOException {
		File outPutFile = new File("E:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt");
		FileWriter outPutFileWriter = new FileWriter(outPutFile);
		Set<Map.Entry<String, Map<String, Double>>> allWords = allTestSampleMap.entrySet();
		
		for(Iterator<Map.Entry<String, Map<String, Double>>> it = allWords.iterator(); it.hasNext();){
			
			Map.Entry<String, Map<String, Double>> me = it.next();
			outPutFileWriter.append(me.getKey()+" ");
			
			Set<Map.Entry<String, Double>> vectorSet = me.getValue().entrySet();
			for(Iterator<Map.Entry<String, Double>> vt = vectorSet.iterator(); vt.hasNext();){
				Map.Entry<String, Double> vme = vt.next();
				outPutFileWriter.append(vme.getKey()+" "+vme.getValue()+" ");
			}
			outPutFileWriter.append("\n");
			outPutFileWriter.flush();
		}
		outPutFileWriter.close();
		
	}

	/**
	 * Count the total occurrences of every word; the words occurring more than
	 * n times form the final feature dictionary
	 * @param strDir absolute path of the preprocessed newsgroup directory
	 * @param wordMap dictionary accumulating every word seen so far
	 * @return newWordMap the feature dictionary of words occurring more than n times
	 * @throws IOException
	 */
	public SortedMap<String, Double> countWords(String strDir,
			Map<String, Double> wordMap) throws IOException {
		
		File sampleFile = new File(strDir);
		File[] sample = sampleFile.listFiles();
		String word;
		
		for(int i =0 ;i < sample.length;i++){
			
			if(!sample[i].isDirectory()){
				FileReader samReader = new FileReader(sample[i]);
				BufferedReader samBR = new BufferedReader(samReader);
				while((word = samBR.readLine()) != null){
					if(word.isEmpty()) // skip blank lines instead of counting them
						continue;
					if(wordMap.containsKey(word))
						wordMap.put(word, wordMap.get(word)+1);
					else
						wordMap.put(word, 1.0);
				}
				samBR.close();
			}else{
				countWords(sample[i].getCanonicalPath(),wordMap);
			}
		}
		
		/*
		 * After stop-word removal, features are selected with the DF method for now;
		 * other feature-selection algorithms can be plugged in later
		 */
		SortedMap<String, Double> newWordMap = new TreeMap<String, Double>();
		Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			if(me.getValue() > 100) // DF-based dimensionality reduction
				newWordMap.put(me.getKey(), me.getValue());
		}
		
		return newWordMap;
	}
	
	/**
	 * Compute the IDF statistics: for every word, in how many documents it appears
	 * @param testSampleDir directory of the clustering test samples
	 * @return word IDF map <word, number of documents containing the word>
	 * @throws IOException
	 */
	public Map<String, Double> computeIDF(String testSampleDir) throws IOException{
		
		Map<String, Double> IDFPerWordMap = new TreeMap<String, Double>();
		// words of the current document that have already been counted
		Set<String> alreadyCountWord = new HashSet<String>();
		String word;
		File[] samples = new File(testSampleDir).listFiles();
		for(int i = 0; i < samples.length; i++){
			alreadyCountWord.clear();
			FileReader tsReader = new FileReader(samples[i]);
			BufferedReader tsBR = new BufferedReader(tsReader);
			while((word = tsBR.readLine()) != null){
				if(!alreadyCountWord.contains(word)){
					if(IDFPerWordMap.containsKey(word))
						IDFPerWordMap.put(word, IDFPerWordMap.get(word) + 1);
					else
						IDFPerWordMap.put(word, 1.0);
					alreadyCountWord.add(word);
				}
			}
			tsBR.close();
		}
		return IDFPerWordMap;
	}
	
	/**
	 * Create the test sample set for clustering: keep only the words that are in
	 * the feature dictionary and write the filtered documents to desDir
	 * @param srcDir source directory of the preprocessed documents
	 * @param desDir destination directory of the filtered test samples
	 * @return the feature dictionary as a String array
	 * @throws IOException
	 */
	public String[] createTestSamples(String srcDir, String desDir) throws IOException{
		
		SortedMap<String, Double> wordMap = new TreeMap<String, Double>();
		wordMap = countWords(srcDir,wordMap);
		System.out.println("special words map sizes:" + wordMap.size());
		String word,testSampleFile;
		
		File[] sampleDir = new File(srcDir).listFiles();
		for(int i = 0; i < sampleDir.length; i++){
			String[] sampleName = sampleDir[i].list();
			for(int j = 0; j < sampleName.length; j++){
				// prefix the file name with its category so the true label stays recoverable
				testSampleFile = desDir + sampleDir[i].getName() + "_" + sampleName[j];
				FileReader srcReader = new FileReader(sampleDir[i].getCanonicalPath() + "\\" + sampleName[j]);
				BufferedReader srcBR = new BufferedReader(srcReader);
				FileWriter tsWriter = new FileWriter(new File(testSampleFile));
				while((word = srcBR.readLine()) != null){
					if(wordMap.containsKey(word)) // keep only dictionary words
						tsWriter.append(word + "\n");
				}
				tsWriter.flush();
				srcBR.close();
				tsWriter.close();
			}
		}
		
		// return all words of the feature dictionary as an array
		String[] terms = new String[wordMap.size()];
		int i = 0;
		Set<Map.Entry<String, Double>> allWords = wordMap.entrySet();
		for(Iterator<Map.Entry<String, Double>> it = allWords.iterator(); it.hasNext();){
			Map.Entry<String, Double> me = it.next();
			terms[i] = me.getKey();
			i++;
		}
		
		return terms;
		
	}
	
	
	

	
	
}
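To make the weighting in computeTFMultiIDF concrete: each term gets the weight (tf / document length) × ln(N / df), where N is the number of test files and df the number of files containing the term. The sketch below uses invented numbers purely for illustration:

public class TfIdfToy {
	public static void main(String[] args) {
		// Invented numbers, for illustration only:
		double tf = 5;        // occurrences of the term in one document
		double docLen = 100;  // total words in that document (wordSumPerDoc above)
		double df = 100;      // documents containing the term (from computeIDF)
		double n = 18828;     // total number of documents
		double weight = (tf / docLen) * Math.log(n / df);
		System.out.println("TF-IDF weight = " + weight); // ≈ 0.262
	}
}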

3. The K-means Algorithm

K-means is a classic clustering algorithm. Its main steps: choose K initial points (for example, at random) as the initial cluster centers, then compute the distance from every other point to each of the K centers and assign each point to its nearest cluster. After the assignment, recompute the center of each cluster; since the centers move, reassign every point to its nearest updated center, and keep iterating this way until the assignments stabilize.


Initial center selection: random choice, uniform sampling, the max-min method, etc.

Distance measure: 1 - cosine similarity, or the vector inner product (a sketch of the cosine version follows this list)

Stopping condition: evaluate a criterion function and/or cap the number of iterations

Empty clusters: watch out for program bugs caused by clusters that lose all their members
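As a small self-contained sketch of the 1 - cosine similarity measure on the sparse Map<String, Double> vectors used throughout this post (class and method names here are illustrative; the clustering class below implements the same idea in its getDistance method):

import java.util.Map;
import java.util.TreeMap;

public class CosineDistanceSketch {

	// 1 - cosine similarity between two sparse TF-IDF vectors
	static double cosineDistance(Map<String, Double> a, Map<String, Double> b) {
		double dot = 0, normA = 0, normB = 0;
		for (Map.Entry<String, Double> e : a.entrySet()) {
			normA += e.getValue() * e.getValue();
			Double bv = b.get(e.getKey());
			if (bv != null)
				dot += e.getValue() * bv; // only shared terms contribute
		}
		for (double v : b.values())
			normB += v * v;
		return 1 - dot / Math.sqrt(normA * normB); // 0 = same direction, 1 = orthogonal
	}

	public static void main(String[] args) {
		Map<String, Double> d1 = new TreeMap<String, Double>();
		Map<String, Double> d2 = new TreeMap<String, Double>();
		d1.put("space", 0.5); d1.put("nasa", 0.3);
		d2.put("space", 0.4); d2.put("hockey", 0.6);
		System.out.println(cosineDistance(d1, d2)); // ≈ 0.524
	}
}

Only terms shared by both documents contribute to the dot product, which is what makes this measure cheap to evaluate on sparse TF-IDF vectors.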


package com.datamine.kmeans;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.*;

/**
 * Implementation of the k-means clustering algorithm; clusters the newsgroup
 * document set into 10, 20 or 30 clusters.
 * Termination condition: the algorithm stops once every point's nearest cluster
 * center is already the center it is assigned to
 * @author Administrator
 *
 */
public class KmeansCluster {

	/**
	 * Main k-means procedure
	 * @param allTestSampleMap vectorized test samples: <file name, <term, TF-IDF value>>
	 * @param k number of clusters
	 * @return clustering result <file name, id of the assigned cluster>
	 */
	private Map<String, Integer> doProcess(
			Map<String, Map<String, Double>> allTestSampleMap, int k) {
		
		//0. Collect all file names of allTestSampleMap, in order, into an array
		String[] testSampleNames = new String[allTestSampleMap.size()];
		int count = 0, tsLength = allTestSampleMap.size();
		Set<Map.Entry<String, Map<String, Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
		for(Iterator<Map.Entry<String, Map<String, Double>>> it = allTestSampleMapSet.iterator(); it.hasNext();){
			Map.Entry<String, Map<String, Double>> me = it.next();
			testSampleNames[count++] = me.getKey();
		}
		
		//1. Initial centers can be chosen at random or evenly spaced; here we use the latter
		Map<Integer, Map<String, Double>> meansMap = getInitPoint(allTestSampleMap, k);
		double[][] distance = new double[tsLength][k]; //distance[i][j] is the distance from point i to center j
		
		//2. Initialize the k clusters
		int[] assignMeans = new int[tsLength]; //cluster id of every point, initially all 0
		Map<Integer, Vector<Integer>> clusterMember = new TreeMap<Integer, Vector<Integer>>(); //member point indices of every cluster
		int iterNum = 0; //iteration counter
		
		while(true){
			System.out.println("Iteration No." + (iterNum++) + "-------------------------");
			//3. Compute the distance from every point to every cluster center
			for(int i = 0; i < tsLength; i++){
				for(int j = 0; j < k; j++){
					distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]), meansMap.get(j));
				}
			}
			
			//4. Find the nearest cluster center for every point
			int[] nearestMeans = new int[tsLength];
			for(int i = 0; i < tsLength; i++){
				nearestMeans[i] = findNearestMeans(distance, i);
			}
			
			//5. If every point already sits in its nearest cluster, or the iteration
			//cap is reached, the algorithm has converged
			int okCount = 0;
			for(int i = 0; i < tsLength; i++){
				if(nearestMeans[i] == assignMeans[i])
					okCount++;
			}
			System.out.println("okCount = " + okCount);
			if(okCount == tsLength || iterNum >= 10)
				break;
			
			//6. Otherwise reassign the points and rebuild the membership of every cluster
			clusterMember.clear();
			for(int i = 0; i < tsLength; i++){
				assignMeans[i] = nearestMeans[i];
				if(clusterMember.containsKey(nearestMeans[i]))
					clusterMember.get(nearestMeans[i]).add(i);
				else {
					Vector<Integer> mem = new Vector<Integer>();
					mem.add(i);
					clusterMember.put(nearestMeans[i], mem);
				}
			}
			
			//7. Recompute the center of every cluster
			for(int i = 0; i < k; i++){
				if(!clusterMember.containsKey(i)) //an empty cluster keeps its previous center
					continue;
				meansMap.put(i, computeNewMean(clusterMember.get(i), allTestSampleMap, testSampleNames));
			}
		}
		
		//8. Build the result map <file name, cluster id>
		Map<String, Integer> resMap = new TreeMap<String, Integer>();
		for(int i = 0; i < tsLength; i++){
			resMap.put(testSampleNames[i], assignMeans[i]);
		}
		return resMap;
	}
	
	/**
	 * Recompute the center of a cluster as the mean of its member vectors
	 * @param clusterM indices of the member points
	 * @param allTestSampleMap vectorized samples
	 * @param testSampleNames file names, indexed like the distance matrix
	 * @return the new center vector
	 */
	private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
			Map<String, Map<String, Double>> allTestSampleMap, String[] testSampleNames) {
		
		double memberNum = clusterM.size();
		Map<String, Double> newMeanMap = new TreeMap<String, Double>();
		for(Iterator<Integer> it = clusterM.iterator(); it.hasNext();){
			Map<String, Double> currentMemMap = allTestSampleMap.get(testSampleNames[it.next()]);
			for(Map.Entry<String, Double> me : currentMemMap.entrySet()){
				if(newMeanMap.containsKey(me.getKey()))
					newMeanMap.put(me.getKey(), newMeanMap.get(me.getKey()) + me.getValue());
				else
					newMeanMap.put(me.getKey(), me.getValue());
			}
		}
		for(Map.Entry<String, Double> me : newMeanMap.entrySet()){
			newMeanMap.put(me.getKey(), me.getValue() / memberNum);
		}
		return newMeanMap;
	}
	
	/**
	 * Distance between a sample vector and a center: 1 - cosine similarity
	 */
	private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {
		double mul = 0, sum1 = 0, sum2 = 0;
		for(Map.Entry<String, Double> me : map1.entrySet()){
			sum1 += me.getValue() * me.getValue();
			if(map2.containsKey(me.getKey()))
				mul += me.getValue() * map2.get(me.getKey());
		}
		for(Map.Entry<String, Double> me : map2.entrySet()){
			sum2 += me.getValue() * me.getValue();
		}
		return 1 - mul / Math.sqrt(sum1 * sum2);
	}
	
	/**
	 * Index of the nearest cluster center of point m
	 */
	private int findNearestMeans(double[][] distance, int m) {
		double minDist = Double.MAX_VALUE;
		int nearest = 0;
		for(int j = 0; j < distance[m].length; j++){
			if(distance[m][j] < minDist){
				minDist = distance[m][j];
				nearest = j;
			}
		}
		return nearest;
	}
	
	/**
	 * Choose the k initial centers by taking samples evenly spaced through the sorted map
	 * @param allTestSampleMap vectorized samples
	 * @param k number of clusters
	 * @return initial centers <cluster id, center vector>
	 */
	private Map<Integer, Map<String, Double>> getInitPoint(
			Map<String, Map<String, Double>> allTestSampleMap, int k) {
		
		int count = 0, i = 0;
		Map<Integer, Map<String, Double>> meansMap = new TreeMap<Integer, Map<String, Double>>();
		System.out.println("Files chosen as the initial centers:");
		Set<Map.Entry<String, Map<String, Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
		for(Iterator<Map.Entry<String, Map<String, Double>>> it = allTestSampleMapSet.iterator(); it.hasNext();){
			Map.Entry<String, Map<String, Double>> me = it.next();
			if(count == i * allTestSampleMapSet.size() / k){
				meansMap.put(i, me.getValue());
				System.out.println(me.getKey());
				i++;
			}
			count++;
		}
		
		return meansMap;
	}

	/**
	 * Write the clustering result to a file
	 * @param kmeansClusterResult clustering result
	 * @param kmeansClusterResultFile output file for the clustering result
	 * @throws IOException 
	 */
	private void printClusterResult(Map<String, Integer> kmeansClusterResult,
			String kmeansClusterResultFile) throws IOException {

		FileWriter resultWriter = new FileWriter(kmeansClusterResultFile);
		Set<Map.Entry<String, Integer>> kmeansClusterResultSet = kmeansClusterResult.entrySet();
		for(Iterator<Map.Entry<String, Integer>> it = kmeansClusterResultSet.iterator(); it.hasNext();){
			Map.Entry<String, Integer> me = it.next();
			resultWriter.append(me.getKey()+" "+me.getValue()+"\n");
		}
		resultWriter.flush();
		resultWriter.close();
	}
	
	/**
	 * Evaluation: compute the entropy and the confusion matrix from the result file
	 * @param kmeansClusterResultFile clustering result file
	 * @param k number of clusters
	 * @return entropy of the clustering result
	 * @throws IOException 
	 */
	private double evaluateClusterResult(String kmeansClusterResultFile, int k) throws IOException {

		Map<String, String> rightCate = new TreeMap<String, String>();
		Map<String, String> resultCate = new TreeMap<String, String>();
		FileReader crReader = new FileReader(kmeansClusterResultFile);
		BufferedReader crBR  = new BufferedReader(crReader);
		String[] s;
		String line;
		while((line = crBR.readLine()) != null){
			s = line.split(" ");
			resultCate.put(s[0], s[1]);
			rightCate.put(s[0], s[0].split("_")[0]); //the true category is the file-name prefix
		}
		crBR.close();
		return computeEntropyAndConfuMatrix(rightCate, resultCate, k); //return the entropy
	}
	
	/**
	 * Compute and print the confusion matrix; return the entropy
	 * @param rightCate map of the true categories
	 * @param resultCate map of the clustering result
	 * @param k number of clusters
	 * @return entropy of the clustering
	 */
	private double computeEntropyAndConfuMatrix(Map<String, String> rightCate,
			Map<String, String> resultCate, int k) {
		
		//k rows, 20 columns; [i][j] is the number of files in cluster i belonging to category j
		int[][] confusionMatrix = new int[k][20];
		
		//First map every category name to an array index
		SortedSet<String> cateNames = new TreeSet<String>();
		Set<Map.Entry<String, String>> rightCateSet = rightCate.entrySet();
		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();){
			Map.Entry<String, String> me = it.next();
			cateNames.add(me.getValue());
		}
		
		String[] cateNamesArray = cateNames.toArray(new String[0]);
		Map<String, Integer> cateNamesToIndex = new TreeMap<String, Integer>();
		for(int i = 0; i < cateNamesArray.length; i++){
			cateNamesToIndex.put(cateNamesArray[i], i);
		}
		
		for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();){
			Map.Entry<String, String> me = it.next();
			confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;
		}
		
		//Print the confusion matrix
		double[] clusterSum = new double[k]; //number of files in every cluster
		double[] everyClusterEntropy = new double[k]; //entropy of every cluster
		double clusterEntropy = 0;
		
		System.out.print("      ");
		
		for(int i=0;i<20;i++){
			System.out.printf("%-6d",i);
		}
		
		System.out.println();
		
		for(int i = 0; i < k; i++){
			System.out.printf("%-6d", i);
			for(int j = 0; j < 20; j++){
				System.out.printf("%-6d", confusionMatrix[i][j]);
				clusterSum[i] += confusionMatrix[i][j];
			}
			System.out.println();
		}
		System.out.println();
		
		//Entropy of every cluster, weighted by cluster size, gives the overall entropy
		for(int i = 0; i < k; i++){
			if(clusterSum[i] != 0){
				for(int j = 0; j < 20; j++){
					double p = (double)confusionMatrix[i][j] / clusterSum[i];
					if(p != 0)
						everyClusterEntropy[i] += -p * Math.log(p);
				}
				clusterEntropy += clusterSum[i] / (double)rightCate.size() * everyClusterEntropy[i];
			}
		}
		return clusterEntropy;
	}
	
	/**
	 * Cluster the document set with k-means
	 * @param testSampleDir directory of the vectorized test samples
	 * @throws IOException
	 */
	public void KmeansClusterMain(String testSampleDir) throws IOException {
		
		//First compute the TF-IDF vectors: Map<String, Map<String, Double>> is
		//Map<file name, Map<term, TF-IDF value>>
		ComputeWordsVector computV = new ComputeWordsVector();
		
		//int k[] = {10,20,30}; //three cluster counts
		int k[] = {20};
		
		Map<String, Map<String, Double>> allTestSampleMap = computV.computeTFMultiIDF(testSampleDir);
		
		for(int i = 0; i < k.length; i++){
			System.out.println("Start clustering into " + k[i] + " clusters");
			String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\";
			Map<String, Integer> KmeansClusterResult = new TreeMap<String, Integer>();
			KmeansClusterResult = doProcess(allTestSampleMap, k[i]);
			KmeansClusterResultFile += k[i];
			printClusterResult(KmeansClusterResult, KmeansClusterResultFile);
			System.out.println("The Entropy for this Cluster is " + evaluateClusterResult(KmeansClusterResultFile, k[i]));
		}
		
	}
	
	
	public static void main(String[] args) throws IOException {
		
		KmeansCluster test = new KmeansCluster();
		
		String KmeansClusterResultFile = "E:\\DataMiningSample\\KmeansClusterResult\\20";
		System.out.println("The Entropy for this Cluster is " + test.evaluateClusterResult(KmeansClusterResultFile,20));
	}


	
}
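For reference, the entropy printed by computeEntropyAndConfuMatrix is the size-weighted sum of per-cluster entropies, E = Σ_i (n_i / N) · (−Σ_j p_ij · ln p_ij), where p_ij is the fraction of cluster i's files that belong to true category j; lower is better, and a perfectly pure cluster contributes 0. A toy check with invented numbers:

public class EntropyToy {
	public static void main(String[] args) {
		// One cluster holding 80 files of one category and 20 of another,
		// in a corpus of N = 200 files (all numbers invented for illustration).
		int[] counts = {80, 20};
		double clusterSum = 100, n = 200;
		double e = 0;
		for (int c : counts) {
			double p = c / clusterSum;
			if (p != 0)
				e += -p * Math.log(p); // per-cluster entropy, ≈ 0.5004
		}
		double contribution = (clusterSum / n) * e; // this cluster's share of the total
		System.out.println("weighted entropy contribution = " + contribution); // ≈ 0.2502
	}
}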

4. Program Entry Point

package com.datamine.kmeans;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class ClusterMain {

	/**
	 * Entry point of the K-means clustering pipeline
	 * @param args
	 * @throws IOException 
	 */
	public static void main(String[] args) throws IOException {
		
		//Data preprocessing was already implemented for the classification posts and is omitted here
		
		ComputeWordsVector computeV = new ComputeWordsVector();
		
		KmeansCluster kmeansCluster = new KmeansCluster();
		
		String srcDir = "E:\\DataMiningSample\\processedSample\\";
		String desDir = "E:\\DataMiningSample\\clusterTestSample\\";
		
		SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
		String beginTime = sdf.format(new Date());
		System.out.println("程序开始执行时间:"+beginTime);
		
		String[] terms = computeV.createTestSamples(srcDir,desDir);
		kmeansCluster.KmeansClusterMain(desDir);
		
		String endTime = sdf.format(new Date());
		System.out.println("程序结束执行时间:"+endTime);
		
	}
	
	
}

5. Clustering Results

Program start time: 2016-03-14 17:02:38
special words map sizes:3832
the total number of test files is 18828
Start clustering into 20 clusters
Files chosen as the initial centers:
alt.atheism_49960
comp.graphics_38307
comp.os.ms-windows.misc_10112
comp.sys.ibm.pc.hardware_58990
comp.sys.mac.hardware_50449
comp.windows.x_66402
comp.windows.x_68299
misc.forsale_76828
rec.autos_103685
rec.motorcycles_105046
rec.sport.baseball_104941
rec.sport.hockey_54126
sci.crypt_15819
sci.electronics_54016
sci.med_59222
sci.space_61185
soc.religion.christian_20966
talk.politics.guns_54517
talk.politics.mideast_76331
talk.politics.misc_178699
Iteration No.0-------------------------
okCount = 512
Iteration No.1-------------------------
okCount = 10372
Iteration No.2-------------------------
okCount = 15295
Iteration No.3-------------------------
okCount = 17033
Iteration No.4-------------------------
okCount = 17643
Iteration No.5-------------------------
okCount = 18052
Iteration No.6-------------------------
okCount = 18282
Iteration No.7-------------------------
okCount = 18404
Iteration No.8-------------------------
okCount = 18500
Iteration No.9-------------------------
okCount = 18627
      0     1     2     3     4     5     6     7     8     9     10    11    12    13    14    15    16    17    18    19    
0     482   0     3     3     1     1     0     5     2     1     0     0     2     27    11    53    4     6     15    176   
1     4     601   69    8     14    127   7     5     5     8     0     14    31    16    34    2     2     2     1     5     
2     1     64    661   96    18    257   26    9     3     0     0     13    25    13    6     2     3     2     6     2     
3     0     56    78    575   213   15    119   15    6     2     1     4     131   2     4     2     6     0     2     1     
4     1     25    13    151   563   11    50    3     3     1     2     14    125   4     8     1     0     3     0     0     
5     2     28    78    25    37    348   13    2     0     0     2     5     38    5     6     2     1     1     2     8     
6     20    80    24    21    23    166   38    45    45    26    10    37    87    34    27    22    15    8     35    12    
7     4     20    6     24    45    6     629   28    20    14    0     3     87    10    4     1     8     0     13    0     
8     0     2     1     10    8     4     25    781   40    1     1     0     70    5     10    2     8     4     2     3     
9     4     2     11    0     1     1     11    34    831   1     0     1     7     7     0     1     1     1     8     0     
10    10    7     6     2     4     1     7     7     4     633   4     5     11    18    9     5     13    8     10    3     
11    1     0     1     9     4     1     20    1     3     286   961   0     17    8     4     2     2     0     5     3     
12    3     14    0     6     1     2     2     0     1     1     0     858   51    1     1     2     16    8     69    4     
13    3     15    4     7     7     17    5     12    8     5     2     5     46    13    793   6     5     2     30    5     
14    2     4     0     1     0     2     4     6     3     4     4     2     14    746   3     1     2     3     55    11    
15    30    43    29    39    15    18    12    13    7     3     4     13    195   38    36    5     6     18    5     11    
16    195   1     0     2     0     1     1     0     4     1     4     1     4     16    6     846   3     6     16    274   
17    8     2     0     2     4     2     1     5     7     0     0     10    30    12    5     28    363   9     289   23    
18    19    1     0     0     2     0     0     6     0     1     1     3     1     3     2     9     8     843   48    18    
19    10    8     1     1     1     0     2     13    2     6     3     3     9     12    18    5     444   16    164   69    

The Entropy for this Cluster is 1.2444339205006887
Program end time: 2016-03-14 17:08:24



