先来贴源码吧:
package edu.wvtool.test;
import java.io.FileWriter;
import edu.udo.cs.wvtool.config.WVTConfiguration;
import edu.udo.cs.wvtool.config.WVTConfigurationFact;
import edu.udo.cs.wvtool.generic.output.WordVectorWriter;
import edu.udo.cs.wvtool.generic.stemmer.DummyStemmer;
import
edu.udo.cs.wvtool.generic.tokenizer.NGramTokenizer;
import
edu.udo.cs.wvtool.generic.tokenizer.SimpleTokenizer;
import edu.udo.cs.wvtool.generic.vectorcreation.TFIDF;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.main.WVTFileInputList;
import
edu.udo.cs.wvtool.main.WVTWordVector;
import edu.udo.cs.wvtool.main.WVTool;
import edu.udo.cs.wvtool.wordlist.WVTWordList;
public
class WVToolTest4 {
public
static
void main(String[] args)
throws Exception {
//初始化一个WVTool对象
WVTool wvt =
new WVTool(
true);
//初始化一个configuration对象
WVTConfiguration config =
new WVTConfiguration();
//配置config
config.setConfigurationRule(WVTConfiguration.
STEP_STEMMER,
new WVTConfigurationFact(
new DummyStemmer()));
//定义两个输入类别的文件
WVTFileInputList list =
new WVTFileInputList(2);
// Add entries
//为输入添加一个文档信息对象 (WVTDocumentInfo),其中sourceName对象可以是一个文件夹的名称,也可以是一个文件名称, 最后一个0这个文档信息对象的类别
//样本数据
list.addEntry(
new WVTDocumentInfo("E:/VSMTest/edu.txt", "txt", "utf-8", "chinese", 0));
list.addEntry(
new WVTDocumentInfo("
E:/VSMTest/gov.txt, "txt", "utf-8", "chinese", 1)); "
//生成wordList
WVTWordList wordList = wvt.createWordList(list, config);
//对wordList中词频做出一个限制,即词频在1<n<5之间
wordList.pruneByFrequency(1, 5);
//生成词组文件
wordList.storePlain(
new FileWriter("E:/VSMTest/wordlist.txt"));
FileWriter outFile =
new FileWriter("E:/VSMTest/wv.txt");
WordVectorWriter wvw =
new WordVectorWriter(outFile,
true);
config.setConfigurationRule(WVTConfiguration.
STEP_OUTPUT,
new WVTConfigurationFact(wvw));
config.setConfigurationRule(WVTConfiguration.
STEP_VECTOR_CREATION,
new WVTConfigurationFact(
new TFIDF()));
// Create the vectors
wvt.createVectors(list, config, wordList);
// Close the output file
wvw.close();
outFile.close();
}
}
样本数据内容:
E:/VSMTest/edu.txt内容:
Education in its broadest, general sense is the means through which the aims and habits of a group of people lives on from one generation to the next.China Education! China Education!
E:/VSMTest/gov.txt内容:
This article is about the People's Republic of China.
运算过程中,先统计各单词的词频TF、文档总数N以及文档频率DF。
输出结果:
wordlist.txt内容(去除了停用词):
Education
broadest
general
sense
means
aims
habits
group
people
lives
generation
China
article
People
Republic
wv.txt内容:
E:/VSMTest/edu.txt; 0:0.6882472016116853 1:0.22941573387056174 2:0.22941573387056174 3:0.22941573387056174 4:0.22941573387056174 5:0.22941573387056174 6:0.22941573387056174 7:0.22941573387056174 8:0.22941573387056174 9:0.22941573387056174 10:0.22941573387056174
E:/VSMTest/gov.txt; 12:0.5773502691896257 13:0.5773502691896257 14:0.5773502691896257
其中值为归一化后的TFIDF值。