使用 Weka 进行文本聚类的例子

先看上一篇会更容易看懂,这篇的注释不多!

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;

import weka.clusterers.Clusterer;
import weka.clusterers.SimpleKMeans;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.filters.Filter;
import weka.filters.unsupervised.attribute.StringToWordVector;



/**
 * Clusters the text files found in a directory with a Weka {@link Clusterer}.
 *
 * <p>Every regular file directly inside {@link #path} becomes one instance with a single
 * string attribute named {@code message}. The data set is then run through a
 * {@link StringToWordVector} filter and the filtered data is what the clusterer sees.
 *
 * <p>Typical use: construct with a configured clusterer, call {@link #loadInstances()},
 * then {@link #testCluster()}.
 */
public class MessageClustering {

    /** Filtered data set; {@code null} until {@link #loadInstances()} has completed. */
    private Instances instances = null;

    /** Filter applied to the raw string instances in {@link #loadInstances()}. */
    private StringToWordVector filter = new StringToWordVector();

    /** Clustering algorithm supplied by the caller. */
    private Clusterer clusterer = null;

    /**
     * Creates a clustering run around the given algorithm.
     *
     * @param clusterer the clusterer that {@link #testCluster()} builds and queries
     */
    public MessageClustering(Clusterer clusterer) {
        this.clusterer = clusterer;
    }

    /** Directory whose files are loaded by {@link #loadInstances()}. */
    static String path = "E:\\datasets\\alt.atheism\\";

    /**
     * Reads every regular file in {@link #path}, builds one string instance per file and
     * replaces {@code instances} with the output of the {@link StringToWordVector} filter.
     *
     * @throws IOException if {@link #path} cannot be listed as a directory
     * @throws Exception if the Weka filter fails
     */
    public void loadInstances() throws Exception {
        File[] files = new File(path).listFiles();
        if (files == null) {
            // listFiles() returns null (it does not throw) when the path is not a
            // directory or cannot be read; fail with a message instead of an NPE.
            throw new IOException("Not a readable directory: " + path);
        }
        // listFiles() gives no ordering guarantee; sorting keeps the instance numbers
        // printed by testCluster() stable from run to run.
        Arrays.sort(files);

        FastVector attributes = new FastVector(1);
        // Passing a null value list makes Weka treat the attribute as a string attribute.
        attributes.addElement(new Attribute("message", (FastVector) null));
        Instances rawData = new Instances("text", attributes, 100);
        Attribute messageAttribute = rawData.attribute("message");

        for (File file : files) {
            if (!file.isFile()) {
                // Sub-directories cannot be read as text; skip them rather than
                // adding an empty document to the data set.
                continue;
            }
            String message = getAllMessage(file);
            Instance instance = new Instance(1);
            instance.setValue(messageAttribute, messageAttribute.addStringValue(message));
            instance.setDataset(rawData);
            rawData.add(instance);
        }

        filter.setInputFormat(rawData);
        instances = Filter.useFilter(rawData, filter);
    }

    /**
     * Builds the clusterer on the loaded data, then prints one line per instance in the
     * form {@code <tab><1-based index>:<cluster>} followed by the number of clusters.
     *
     * @throws IllegalStateException if {@link #loadInstances()} has not been called
     * @throws Exception if the clusterer fails
     */
    public void testCluster() throws Exception {
        if (instances == null) {
            throw new IllegalStateException("loadInstances() must be called before testCluster()");
        }
        clusterer.buildClusterer(instances);
        for (int i = 0; i < instances.numInstances(); i++) {
            int cluster = clusterer.clusterInstance(instances.instance(i));
            System.out.println("\t" + (i + 1) + ":" + cluster);
        }
        System.out.println(clusterer.numberOfClusters());
    }

    /**
     * Returns the content of {@code file} as a single line: each source line is trimmed,
     * blank lines are dropped and the remaining lines are joined with one space.
     *
     * <p>Reading is best-effort: if an I/O error occurs, a warning is written to
     * {@code System.err} and whatever was read up to that point is returned.
     *
     * @param file the file to read
     * @return the joined text, possibly empty
     */
    private String getAllMessage(File file) {
        StringBuilder sb = new StringBuilder();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(file));
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.length() == 0) {
                    continue;
                }
                // Separate lines with a space so the last word of one line is not
                // fused with the first word of the next.
                if (sb.length() > 0) {
                    sb.append(' ');
                }
                sb.append(trimmed);
            }
        } catch (IOException e) {
            System.err.println("Could not read " + file + ": " + e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails after reading.
                }
            }
        }
        return sb.toString();
    }

    /**
     * Runs k-means with 5 clusters over the documents in {@link #path}.
     *
     * @param args optional; when present, {@code args[0]} replaces the default data directory
     * @throws Exception if loading or clustering fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length > 0) {
            path = args[0];
        }

        // Set up the clustering algorithm.
        SimpleKMeans cluster = new SimpleKMeans();
        cluster.setNumClusters(5);

        MessageClustering sk = new MessageClustering(cluster);
        sk.loadInstances();
        // Print the cluster assignment of every document.
        sk.testCluster();
    }

}

你可能感兴趣的:(例子)