(Update 2012.12.28: for common questions about downloading and running this project, see the FAQ post newsgroup18828文本分类器、文本聚类器、关联分析频繁模式挖掘算法的Java实现工程下载及运行FAQ.)
This post covers the following:
Preprocessing the newsgroup document set, and reducing dimensionality by selecting feature words with the DF method and with SVD decomposition
Implementing three clustering algorithms: K-means, MBSAS, and DBSCAN
Clustering the newsgroup documents with the Weka toolkit
Evaluating the algorithms by computing the entropy of each clustering result
1. Preprocessing the newsgroup document set
The newsgroup collection is a standard dataset for data-mining experiments. Text preprocessing mainly consists of tokenization, removal of punctuation and other irrelevant symbols, stop-word removal, and so on; a detailed description is in my other post 数据挖掘-基于贝叶斯算法及KNN算法的newsgroup18828文本分类器的JAVA实现(上). Here I only give the code where the preprocessing and vectorization differ from that post.
Text preprocessing class DataPreProcess.java (the full walkthrough is in the post above).
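Since the class itself is covered in the earlier post, here is only a minimal, hypothetical sketch of the pipeline it implements; the class and method names below are illustrative, not the actual DataPreProcess API:

- import java.util.ArrayList;
- import java.util.List;
- import java.util.Set;
- 
- // Hypothetical sketch of the preprocessing steps described above.
- public class PreProcessSketch {
- public static List<String> preprocess(String rawText, Set<String> stopWords) {
- // split into words, dropping punctuation, digits and other symbols
- String[] tokens = rawText.toLowerCase().replaceAll("[^a-z]+", " ").trim().split("\\s+");
- List<String> kept = new ArrayList<String>();
- for (String t : tokens) {
- // remove stop words; stemming (e.g. Porter) would follow here
- if (!t.isEmpty() && !stopWords.contains(t)) kept.add(t);
- }
- return kept;
- }
- }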
Documents are represented as vectors of TF-IDF weights.
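Concretely, computeTFMultiIDF below weights term t in document d by a maximum-normalized term frequency times a base-10 IDF, where N is the number of documents and df_t is the number of documents containing t:

w(t,d) = \frac{tf_{t,d}}{\max_{t'} tf_{t',d}} \cdot \log_{10}\frac{N}{df_t}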
ComputeWordsVector.java
- package com.pku.yangliu;
- import java.io.BufferedReader;
- import java.io.File;
- import java.io.FileReader;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.HashSet;
- import java.util.SortedMap;
- import java.util.Map;
- import java.util.Set;
- import java.util.SortedSet;
- import java.util.TreeMap;
- import java.util.Iterator;
- import java.util.TreeSet;
- 
- /**
- * Builds TF-IDF vectors for the preprocessed newsgroup documents and
- * provides feature selection and cluster-evaluation helpers.
- */
- public class ComputeWordsVector {
- 
- /**
- * Computes the TF-IDF vector of every document under testSampleDir
- * (one word per line after preprocessing).
- * @return a map from file name to its term-weight map
- */
- public Map<String,Map<String,Double>> computeTFMultiIDF(String testSampleDir) throws IOException{
- String word;
- Map<String,Map<String,Double>> allTestSampleMap = new TreeMap<String,Map<String,Double>>();
- Map<String, Double> idfPerWordMap = computeIDF(testSampleDir);
- Map<String,Double> TFPerDocMap = new TreeMap<String,Double>();
- File[] samples = new File(testSampleDir).listFiles();
- System.out.println("the total number of test files is" + samples.length);
- for(int i = 0; i < samples.length; i++){
- TFPerDocMap.clear();
- FileReader samReader = new FileReader(samples[i]);
- BufferedReader samBR = new BufferedReader(samReader);
- Double wordSumPerDoc = 0.0; // total word count (not used by the max-normalized TF below)
- while((word = samBR.readLine()) != null){
- if(!word.isEmpty()){
- wordSumPerDoc++;
- if(TFPerDocMap.containsKey(word)){
- Double count = TFPerDocMap.get(word);
- TFPerDocMap.put(word, count + 1.0);
- }
- else {
- TFPerDocMap.put(word, 1.0);
- }
- }
- }
- samBR.close();
- 
- // find the largest term count in this document for TF normalization
- Double maxCount = 0.0, wordWeight;
- Set<Map.Entry<String, Double>> tempTF = TFPerDocMap.entrySet();
- for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();){
- Map.Entry<String, Double> me = mt.next();
- if(me.getValue() > maxCount) maxCount = me.getValue();
- }
- for(Iterator<Map.Entry<String, Double>> mt = tempTF.iterator(); mt.hasNext();){
- Map.Entry<String, Double> me = mt.next();
- Double IDF = Math.log(samples.length / idfPerWordMap.get(me.getKey())) / Math.log(10);
- wordWeight = (me.getValue() / maxCount) * IDF;
- TFPerDocMap.put(me.getKey(), wordWeight);
- }
- TreeMap<String,Double> tempMap = new TreeMap<String,Double>();
- tempMap.putAll(TFPerDocMap);
- allTestSampleMap.put(samples[i].getName(), tempMap);
- }
-
- return allTestSampleMap;
- }
- 
- /**
- * Writes all document vectors to a text file, one document per line:
- * fileName term1 weight1 term2 weight2 ...
- */
- void printTestSampleMap(Map<String,Map<String,Double>> allTestSampleMap) throws IOException {
-
- File outPutFile = new File("F:/DataMiningSample/KmeansClusterResult/allTestSampleMap.txt");
- FileWriter outPutFileWriter = new FileWriter(outPutFile);
- Set<Map.Entry<String,Map<String,Double>>> allWords = allTestSampleMap.entrySet();
- for(Iterator<Map.Entry<String,Map<String,Double>>> it = allWords.iterator(); it.hasNext();){
- Map.Entry<String,Map<String,Double>> me = it.next();
- outPutFileWriter.append(me.getKey() + " ");
- Set<Map.Entry<String,Double>> vecSet = me.getValue().entrySet();
- for(Iterator<Map.Entry<String, Double>> jt = vecSet.iterator(); jt.hasNext();){
- Map.Entry<String, Double> ne = jt.next();
- outPutFileWriter.append(ne.getKey() + " "+ ne.getValue() + " ");
- }
- outPutFileWriter.append("\n");
- outPutFileWriter.flush();
- }
- outPutFileWriter.close();
- }
- 
- /**
- * Counts word occurrences over all "stemed" files under strDir and
- * keeps only words occurring more than 100 times (DF-based feature
- * selection).
- */
- public SortedMap<String,Double> countWords(String strDir,Map<String, Double> wordMap) throws IOException{
- File sampleFile = new File(strDir);
- File [] sampleDir = sampleFile.listFiles();
- String word;
- for(int j = 0; j < sampleDir.length; j++){
- File[] sample = sampleDir[j].listFiles();
- for(int i = 0; i < sample.length; i++){
- if(sample[i].getName().contains("stemed")){
- FileReader samReader = new FileReader(sample[i]);
- BufferedReader samBR = new BufferedReader(samReader);
- while((word = samBR.readLine()) != null){
- // guard against empty lines, which the original code counted as words
- if(!word.isEmpty()){
- if(wordMap.containsKey(word)){
- double count = wordMap.get(word) + 1;
- wordMap.put(word, count);
- }
- else {
- wordMap.put(word, 1.0);
- }
- }
- }
- samBR.close();
- }
- }
- }
-
-
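- // keep only the high-frequency words as features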
- SortedMap<String,Double> newWordMap = new TreeMap<String,Double>();
- Set<Map.Entry<String,Double>> allWords = wordMap.entrySet();
- for(Iterator<Map.Entry<String,Double>> it = allWords.iterator(); it.hasNext();){
- Map.Entry<String, Double> me = it.next();
- if(me.getValue() > 100){
- newWordMap.put(me.getKey(),me.getValue());
- }
- }
- return newWordMap;
- }
-
- /**
- * Counts, for each word, the number of documents containing it.
- * Despite the name, this returns the document frequency df; the actual
- * IDF log10(N/df) is derived from it in computeTFMultiIDF.
- */
- Map<String,Double> computeIDF(String testSampleDir) throws IOException {
-
- Map<String,Double> IDFPerWordMap = new TreeMap<String,Double>();
- Set<String> alreadyCountWord = new HashSet<String>();
- String word;
- File[] samples = new File(testSampleDir).listFiles();
- for(int i = 0; i < samples.length; i++){
- alreadyCountWord.clear();
- FileReader tsReader = new FileReader(samples[i]);
- BufferedReader tsBR = new BufferedReader(tsReader);
- while((word = tsBR.readLine()) != null){
- if(!alreadyCountWord.contains(word)){
- if(IDFPerWordMap.containsKey(word)){
- IDFPerWordMap.put(word, IDFPerWordMap.get(word) + 1.0);
- }
- else IDFPerWordMap.put(word, 1.0);
- alreadyCountWord.add(word);
- }
- }
- tsBR.close();
- }
- return IDFPerWordMap;
- }
-
- /**
- * Builds the clustering test set: copies each "stemed" file to destDir,
- * keeping only the selected feature words, and returns those words.
- */
- String[] createTestSamples( String srcDir, String destDir) throws IOException {
-
- SortedMap<String,Double> wordMap = new TreeMap<String,Double>();
- wordMap = countWords(srcDir, wordMap);
- System.out.println("special words map sizes:" + wordMap.size());
- String word, testSampleFile;
- File[] sampleDir = new File(srcDir).listFiles();
- for(int i = 0; i < sampleDir.length; i++){
- File[] sample = sampleDir[i].listFiles();
- for(int j = 0;j < sample.length; j++){
- if(sample[j].getName().contains("stemed")){
- testSampleFile = destDir + sampleDir[i].getName()+"_"+sample[j].getName();
- FileReader samReader = new FileReader(sample[j]);
- BufferedReader samBR = new BufferedReader(samReader);
- FileWriter tsWriter = new FileWriter(new File(testSampleFile));
- while((word = samBR.readLine()) != null){
- if(wordMap.containsKey(word)){
- tsWriter.append(word + "\n");
- }
- }
- samBR.close();
- tsWriter.flush();
- tsWriter.close();
- }
- }
- }
-
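- // collect the selected feature words into an array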
- String [] terms = new String[wordMap.size()];
- int i = 0;
- Set<Map.Entry<String,Double>> allWords = wordMap.entrySet();
- for(Iterator<Map.Entry<String,Double>> it = allWords.iterator(); it.hasNext();){
- Map.Entry<String, Double> me = it.next();
- terms[i] = me.getKey();
- i++;
- }
- return terms;
- }
-
- /**
- * Evaluates a clustering result file (lines of "fileName clusterLabel")
- * against the true categories encoded in the file-name prefix, and
- * returns the overall entropy.
- */
- double evaluateClusterRes(String clusterResultFile, int K) throws IOException {
-
- Map<String,String> rightCate = new TreeMap<String,String>();
- Map<String,String> resultCate = new TreeMap<String,String>();
- FileReader crReader = new FileReader(clusterResultFile);
- BufferedReader crBR = new BufferedReader(crReader);
- String[] s;
- String line;
- while((line = crBR.readLine()) != null){
- s = line.split(" ");
- resultCate.put(s[0], s[1]);
-
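- // the true category is the file-name prefix before the first "_"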
- rightCate.put(s[0], s[0].split("_")[0]);
- }
- return computeEntropyAndConfuMatrix(rightCate,resultCate,K);
- }
-
- /**
- * Prints the K x 20 confusion matrix (clusters x true categories) and
- * returns the size-weighted average entropy over all clusters.
- */
- private double computeEntropyAndConfuMatrix(Map<String, String> rightCate,
- Map<String, String> resultCate, int K) {
-
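- // confusionMatrix[i][j]: documents of true category j assigned to cluster i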
- int[][] confusionMatrix = new int[K][20];
-
- SortedSet<String> cateNames = new TreeSet<String>();
- Set<Map.Entry<String, String>> rightCateSet = rightCate.entrySet();
- for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();){
- Map.Entry<String, String> me = it.next();
- cateNames.add(me.getValue());
- }
- String[] cateNamesArray = cateNames.toArray(new String[0]);
- Map<String,Integer> cateNamesToIndex = new TreeMap<String,Integer>();
- for(int i = 0; i < cateNamesArray.length; i++){
- cateNamesToIndex.put(cateNamesArray[i],i);
- }
- for(Iterator<Map.Entry<String, String>> it = rightCateSet.iterator(); it.hasNext();){
- Map.Entry<String, String> me = it.next();
- confusionMatrix[Integer.parseInt(resultCate.get(me.getKey()))][cateNamesToIndex.get(me.getValue())]++;
- }
-
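- // print the confusion matrix, then compute the size-weighted entropy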
- double [] clusterSum = new double[K];
- double[] everyClusterEntropy = new double[K];
- double clusterEntropy = 0;
- System.out.print(" ");
- for(int i = 0; i < 20; i++){
- System.out.print(i + " ");
- }
- System.out.println();
- for(int i = 0; i < K; i++){
- System.out.print(i + " ");
- for(int j = 0; j < 20; j++){
- clusterSum[i] += confusionMatrix[i][j];
- System.out.print(confusionMatrix[i][j]+" ");
- }
- System.out.println();
- }
- System.out.println();
- for(int i = 0; i < K; i++){
- if(clusterSum[i] != 0){
- for(int j = 0; j < 20; j++){
- double p = (double)confusionMatrix[i][j]/clusterSum[i];
- if(p != 0){
- everyClusterEntropy[i] += -p * Math.log(p);
- }
- }
- clusterEntropy += clusterSum[i]/(double)rightCate.size() * everyClusterEntropy[i];
- }
- }
- return clusterEntropy;
- }
-
- }
2. The K-means algorithm
K-means is a classic clustering algorithm. The idea: pick K initial points as the initial centroids, assign every other point to the nearest of the K centroids, then recompute each centroid from its members, which moves the centroids. Reassign all points to the nearest of the updated centroids, recompute the centroids again, and iterate until the assignments stabilize. Its pseudocode is sketched below:
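A plain-text reconstruction of that pseudocode, matching the doProcess method below:

- // K-means(documents, K):
- //   centroids <- K initial points
- //   repeat:
- //     for each document d: assign d to the nearest centroid
- //     for each cluster c: centroid(c) <- mean of the members of c
- //   until no assignment changed (or the iteration limit is reached)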
A K-means implementation hinges on the following points:
Choice of initial centroids: random selection, uniform sampling, the max-min method, etc.
Distance measure: 1 - cosine similarity, Euclidean distance, or 1 - inner product; in my tests 1 - cosine similarity clustered best, while 1 - inner product was fastest (a sketch of the cosine variant follows this list).
Computing centroids: average the member vectors dimension by dimension.
Stopping condition: evaluate a criterion function and cap the number of iterations.
Empty clusters: handle them explicitly, or they will cause bugs in the program.
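For reference, here is a self-contained sketch of the 1 - cosine-similarity distance over the sparse Map vectors used below. Note that computeSim in KmeansCluster returns the raw inner product, so the distance it yields is 1 - inner product; this standalone class is an illustration, not part of the original project:

- import java.util.Map;
- 
- // Standalone sketch: 1 - cosine similarity between sparse TF-IDF vectors.
- public class CosineDistance {
- public static double distance(Map<String, Double> v1, Map<String, Double> v2) {
- double dot = 0, norm1 = 0, norm2 = 0;
- for (Map.Entry<String, Double> e : v1.entrySet()) {
- norm1 += e.getValue() * e.getValue();
- Double w = v2.get(e.getKey());
- if (w != null) dot += e.getValue() * w; // only shared terms contribute
- }
- for (double w : v2.values()) norm2 += w * w;
- if (norm1 == 0 || norm2 == 0) return 1.0; // treat an empty vector as unrelated
- return 1 - dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
- }
- }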
K-means implementation class KmeansCluster.java
- package com.pku.yangliu;
- import java.io.FileWriter;
- import java.io.IOException;
- import java.util.Iterator;
- import java.util.Map;
- import java.util.Set;
- import java.util.TreeMap;
- import java.util.Vector;
-
- /**
- * K-means clusterer for the TF-IDF document vectors.
- */
- public class KmeansCluster {
-
- /**
- * Runs K-means over the document vectors and returns a map from file
- * name to the index of its final cluster.
- */
- private Map<String, Integer> doProcess(
- Map<String, Map<String, Double>> allTestSampleMap, int K) {
-
-
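- // collect the document (file) names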
- String[] testSampleNames = new String[allTestSampleMap.size()];
- int count = 0, tsLength = allTestSampleMap.size();
- Set<Map.Entry<String, Map<String, Double>>> allTestSampeleMapSet = allTestSampleMap.entrySet();
- for(Iterator<Map.Entry<String, Map<String, Double>>> it = allTestSampeleMapSet.iterator(); it.hasNext(); ){
- Map.Entry<String, Map<String, Double>> me = it.next();
- testSampleNames[count++] = me.getKey();
- }
-
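- // pick K initial centroids by uniform sampling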
- Map<Integer, Map<String, Double>> meansMap = getInitPoint(allTestSampleMap, K);
- double [][] distance = new double[tsLength][K];
-
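- // assignMeans[i]: current cluster of document i (all start at 0)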
- int [] assignMeans = new int[tsLength];
- Map<Integer, Vector<Integer>> clusterMember = new TreeMap<Integer,Vector<Integer>>();
- Vector<Integer> mem = new Vector<Integer>();
- int iterNum = 0;
- while(true){
- System.out.println("Iteration No." + (iterNum++) + "----------------------");
-
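- // step 1: distance from every document to every centroid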
- for(int i = 0; i < tsLength; i++){
- for(int j = 0; j < K; j++){
- distance[i][j] = getDistance(allTestSampleMap.get(testSampleNames[i]),meansMap.get(j));
- }
- }
-
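- // step 2: find the nearest centroid for each document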
- int[] nearestMeans = new int[tsLength];
- for(int i = 0; i < tsLength; i++){
- nearestMeans[i] = findNearestMeans(distance, i);
- }
-
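- // step 3: stop when no assignment changed, or after 10 iterations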
- int okCount = 0;
- for(int i = 0; i <tsLength; i++){
- if(nearestMeans[i] == assignMeans[i]) okCount++;
- }
- System.out.println("okCount = " + okCount);
- if(okCount == tsLength || iterNum >= 10) break;
-
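- // step 4: commit the new assignments and rebuild cluster membership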
- clusterMember.clear();
- for(int i = 0; i < tsLength; i++){
- assignMeans[i] = nearestMeans[i];
- if(clusterMember.containsKey(nearestMeans[i])){
- clusterMember.get(nearestMeans[i]).add(i);
- }
- else {
- mem.clear();
- mem.add(i);
- Vector<Integer> tempMem = new Vector<Integer>();
- tempMem.addAll(mem);
- clusterMember.put(nearestMeans[i], tempMem);
- }
- }
-
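- // step 5: recompute each centroid as the mean of its members;
- // an empty cluster keeps its previous centroid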
- for(int i = 0; i < K; i++){
- if(!clusterMember.containsKey(i)){
- continue;
- }
- Map<String, Double> newMean = computeNewMean(clusterMember.get(i), allTestSampleMap, testSampleNames);
- Map<String, Double> tempMean = new TreeMap<String, Double>();
- tempMean.putAll(newMean);
- meansMap.put(i, tempMean);
- }
- }
-
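- // final result: file name -> cluster index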
- Map<String, Integer> resMap = new TreeMap<String, Integer>();
- for(int i = 0; i < tsLength; i++){
- resMap.put(testSampleNames[i], assignMeans[i]);
- }
- return resMap;
- }
-
- /**
- * Computes the new centroid of a cluster as the dimension-wise mean of
- * its member vectors.
- */
- private Map<String, Double> computeNewMean(Vector<Integer> clusterM,
- Map<String, Map<String, Double>> allTestSampleMap,
- String[] testSampleNames) {
-
- double memberNum = (double)clusterM.size();
- Map<String, Double> newMeanMap = new TreeMap<String,Double>();
- Map<String, Double> currentMemMap = new TreeMap<String,Double>();
- for(Iterator<Integer> it = clusterM.iterator(); it.hasNext();){
- int me = it.next();
- currentMemMap = allTestSampleMap.get(testSampleNames[me]);
- Set<Map.Entry<String, Double>> currentMemMapSet = currentMemMap.entrySet();
- for(Iterator<Map.Entry<String, Double>> jt = currentMemMapSet.iterator(); jt.hasNext();){
- Map.Entry<String, Double> ne = jt.next();
- if(newMeanMap.containsKey(ne.getKey())){
- newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey()) + ne.getValue());
- }
- else {
- newMeanMap.put(ne.getKey(), ne.getValue());
- }
- }
- }
-
- Set<Map.Entry<String, Double>> newMeanMapSet = newMeanMap.entrySet();
- for(Iterator<Map.Entry<String, Double>> jt = newMeanMapSet.iterator(); jt.hasNext();){
- Map.Entry<String, Double> ne = jt.next();
- newMeanMap.put(ne.getKey(), newMeanMap.get(ne.getKey()) / memberNum);
- }
- return newMeanMap;
- }
-
- /**
- * Returns the index of the centroid nearest to document m.
- */
- private int findNearestMeans(double[][] distance,int m) {
-
- double minDist = Double.MAX_VALUE; // was 10, which assumed all distances < 10
- int j = 0;
- for(int i = 0; i < distance[m].length; i++){
- if(distance[m][i] < minDist){
- minDist = distance[m][i];
- j = i;
- }
- }
- return j;
- }
-
-
- /**
- * Distance between two document vectors, defined as 1 - similarity.
- */
- private double getDistance(Map<String, Double> map1, Map<String, Double> map2) {
-
- return 1 - computeSim(map1,map2);
- }
-
-
- /**
- * Similarity of two sparse TF-IDF vectors; see the note at the end of
- * the method.
- */
- private double computeSim(Map<String, Double> testWordTFMap,
- Map<String, Double> trainWordTFMap) {
-
- double mul = 0;
- Set<Map.Entry<String, Double>> testWordTFMapSet = testWordTFMap.entrySet();
- for(Iterator<Map.Entry<String, Double>> it = testWordTFMapSet.iterator(); it.hasNext();){
- Map.Entry<String, Double> me = it.next();
- if(trainWordTFMap.containsKey(me.getKey())){
- mul += me.getValue()*trainWordTFMap.get(me.getKey());
- }
-
- }
- 
- // A full cosine similarity would also divide mul by the product of the
- // two vector norms; returning the raw inner product is much faster,
- // and the resulting distance is 1 - innerProduct (see section 2).
- return mul;
- }
-
- /**
- * Selects K initial centroids by taking every (N/K)-th document from
- * the (alphabetically sorted) sample map, i.e. uniform sampling.
- */
- private Map<Integer, Map<String, Double>> getInitPoint(Map<String, Map<String, Double>> allTestSampleMap, int K) {
-
- int count = 0, i = 0;
- Map<Integer, Map<String, Double>> meansMap = new TreeMap<Integer, Map<String, Double>>();
- System.out.println("本次聚类的初始点对应的文件为:");
- Set<Map.Entry<String, Map<String,Double>>> allTestSampleMapSet = allTestSampleMap.entrySet();
- for(Iterator<Map.Entry<String, Map<String,Double>>> it = allTestSampleMapSet.iterator();it.hasNext();){
- Map.Entry<String, Map<String,Double>> me = it.next();
- if(count == i * allTestSampleMapSet.size() / K){
- meansMap.put(i, me.getValue());
- System.out.println(me.getKey() + " map size is " + me.getValue().size());
- i++;
- }
- count++;
- }
- return meansMap;
- }
-
- /**
- * Writes the clustering result, one "fileName clusterIndex" pair per line.
- */
- private void printClusterResult(Map<String, Integer> kmeansClusterResult, String kmeansClusterResultFile) throws IOException {
-
- FileWriter resWriter = new FileWriter(kmeansClusterResultFile);
- Set<Map.Entry<String,Integer>> kmeansClusterResultSet = kmeansClusterResult.entrySet();
- for(Iterator<Map.Entry<String,Integer>> it = kmeansClusterResultSet.iterator(); it.hasNext(); ){
- Map.Entry<String, Integer> me = it.next();
- resWriter.append(me.getKey() + " " + me.getValue() + "\n");
- }
- resWriter.flush();
- resWriter.close();
- }
- 
- /**
- * Entry point: builds the TF-IDF vectors, clusters with K = 10, 20 and 30,
- * writes each result to a file and prints its entropy.
- */
- public void KmeansClusterMain(String testSampleDir) throws IOException {
-
- ComputeWordsVector computeV = new ComputeWordsVector();
- int[] K = {10, 20 ,30};
- Map<String,Map<String,Double>> allTestSampleMap = computeV.computeTFMultiIDF(testSampleDir);
- for(int i = 0; i < K.length; i++){
- System.out.println("开始聚类,聚成" + K[i] + "类");
- String KmeansClusterResultFile = "F:/DataMiningSample/KmeansClusterResult/";
- Map<String,Integer> KmeansClusterResult = new TreeMap<String, Integer>();
- KmeansClusterResult = doProcess(allTestSampleMap, K[i]);
- KmeansClusterResultFile += K[i];
- printClusterResult(KmeansClusterResult,KmeansClusterResultFile);
- System.out.println("The Entropy for this Cluster is " + computeV.evaluateClusterRes(KmeansClusterResultFile, K[i]));
- }
- }
- }
Clusterer main class ClusterMain.java
- package com.pku.yangliu;
-
- import java.io.IOException;
- import java.text.SimpleDateFormat;
-
- /**
- * Main entry point of the document clusterer.
- */
- public class ClusterMain {
-
- /**
- * Preprocesses the corpus, builds the test samples, then runs K-means.
- */
- public static void main(String[] args) throws IOException {
-
- DataPreProcess DataPP = new DataPreProcess();
- ComputeWordsVector computeV = new ComputeWordsVector();
-
- KmeansCluster kmeansCluster2 = new KmeansCluster();
- DataPP.BPPMain(args); // preprocess the raw newsgroup corpus
-
- String srcDir = "F:/DataMiningSample/processedSample_includeNotSpecial/";
- String destDir = "F:/DataMiningSample/clusterTestSample/";
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- String beginTime = sdf.format(new java.util.Date());
- System.out.println("程序开始执行时间:"+beginTime);
- String[] terms = computeV.createTestSamples(srcDir, destDir);
-
- kmeansCluster2.KmeansClusterMain(destDir);
- String endTime = sdf.format(new java.util.Date());
- System.out.println("程序结束执行时间:"+endTime);
- }
- }
3. K-means clustering results
The K-means clustering of the newsgroup documents is scored by the entropy of the result, defined as follows:
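With K clusters over n documents, where n_i is the number of documents in cluster i and p_{ij} is the fraction of cluster i whose documents belong to true category j, the implementation in computeEntropyAndConfuMatrix computes (using the natural logarithm):

E_i = -\sum_{j} p_{ij} \ln p_{ij}, \qquad Entropy = \sum_{i=1}^{K} \frac{n_i}{n} E_i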
The confusion matrix for the newsgroup clustering is as follows (it is also printed to the console by computeEntropyAndConfuMatrix):
This result uses DF-based feature selection to reduce the vocabulary to 6070 words. The entropy is already fairly low: only 1.144 when clustering into 20 classes. Feature selection and dimensionality reduction are an important topic in data-mining research, and I also tried SVD decomposition, as used in LSI, for feature reduction. The detailed implementation, and the comparison of the clustering results with the other two algorithms, are in the next post 数据挖掘-基于Kmeans算法、MBSAS算法及DBSCAN算法的newsgroup18828文本聚类器的JAVA实现(下).