WVTool和分词程序相结合

 

要让 WVTool 支持中文分词,需要实现 WVTTokenizer 和 TokenEnumeration 这两个接口。
 写道
package ICTCLAS.vsm;

import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import ICTCLAS.util.ICTCLASUtil;

import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer;
import edu.udo.cs.wvtool.main.WVTDocumentInfo;
import edu.udo.cs.wvtool.util.TokenEnumeration;
import edu.udo.cs.wvtool.util.WVToolException;

/**
 * WVTool tokenizer that wraps another {@link WVTTokenizer} and splits every
 * token the wrapped tokenizer produces into words by passing it to
 * {@code ICTCLASUtil.ContentProcess}.
 *
 * The instance returned by {@link #tokenize(Reader, WVTDocumentInfo)} is this
 * object itself, so one instance serves a single document at a time.
 *
 *@date 2011-3-21
 * 
 *@author Jing Yang
 * 
 */
public class ChineseTokenizer implements WVTTokenizer, TokenEnumeration {

	/** Segmented words that have been produced but not yet handed out. */
	private final List<String> currentTokens;

	/** Token stream of the wrapped tokenizer; null until tokenize() succeeds. */
	private TokenEnumeration input;

	/** Wrapped tokenizer that delivers the raw text chunks to segment. */
	private final WVTTokenizer tokenizer;

	/**
	 * Creates a tokenizer that segments the output of the given tokenizer.
	 *
	 * @param tokenizer the tokenizer whose tokens are segmented further
	 */
	public ChineseTokenizer(WVTTokenizer tokenizer) {

		super();
		this.currentTokens = new ArrayList<String>();
		this.input = null;
		this.tokenizer = tokenizer;
	}

	/**
	 * Starts tokenizing a new document.
	 *
	 * @param source the text to tokenize; may be null
	 * @param d information about the document, forwarded to the wrapped tokenizer
	 * @return this object as the token stream, or null if {@code source} is null
	 * @throws WVToolException if the wrapped tokenizer or the segmentation fails
	 */
	public TokenEnumeration tokenize(Reader source, WVTDocumentInfo d)
			throws WVToolException {

		if (source == null) {
			return null;
		}

		// Drop words left over from a previous, partially consumed document so
		// they cannot leak into this one.
		currentTokens.clear();
		input = tokenizer.tokenize(source, d);
		readNextToken();
		return this;
	}

	/**
	 * Pulls tokens from the wrapped stream and appends their segmented words
	 * to the buffer. At least one upstream token is consumed if one is
	 * available; reading continues while the buffer is still empty, so an
	 * empty upstream token (or one that segments to nothing) does not make the
	 * stream look exhausted. Does nothing if tokenize() has not been called.
	 *
	 * @throws WVToolException if the wrapped token stream fails
	 */
	public void readNextToken() throws WVToolException {

		if (input == null) {
			return;
		}

		while (input.hasMoreTokens()) {
			String token = input.nextToken();

			if (token != null && token.length() > 0) {
				// Word segmentation helper built on top of ICTCLAS.
				currentTokens.addAll(ICTCLASUtil.ContentProcess(token));
			}

			if (!currentTokens.isEmpty()) {
				break;
			}
		}
	}

	/**
	 * @return true if a document is being tokenized and at least one word is
	 *         buffered
	 */
	public boolean hasMoreTokens() {

		return input != null && !currentTokens.isEmpty();
	}

	/**
	 * Returns the next segmented word and refills the buffer when it runs
	 * empty.
	 *
	 * @return the next word, or null if no word is left
	 * @throws WVToolException if refilling the buffer fails
	 */
	public String nextToken() throws WVToolException {

		if (currentTokens.isEmpty()) {
			return null;
		}

		String result = currentTokens.remove(0);

		// Refill eagerly so that hasMoreTokens() stays accurate.
		if (currentTokens.isEmpty()) {
			readNextToken();
		}

		return result;
	}
}
 

 我再来贴一下别人的代码做个对比

 

 写道
package com.xh; 

import java.io.IOException; 
import java.io.Reader; 
import java.io.StringReader; 
import java.util.ArrayList; 
import java.util.List; 

import org.wltea.analyzer.IKSegmentation; 
import org.wltea.analyzer.Lexeme; 


import edu.udo.cs.wvtool.generic.tokenizer.SimpleTokenizer; 
import edu.udo.cs.wvtool.generic.tokenizer.WVTTokenizer; 
import edu.udo.cs.wvtool.main.WVTDocumentInfo; 
import edu.udo.cs.wvtool.util.TokenEnumeration; 
import edu.udo.cs.wvtool.util.WVToolException; 
/**
 * WVTool tokenizer that combines WVTool with the IKAnalyzer word segmenter.
 *
 * It wraps another {@link WVTTokenizer} and feeds every token that tokenizer
 * produces through {@code IKSegmentation}, handing out the resulting lexeme
 * texts one by one. The original author notes that the program runs rather
 * slowly and offers it only as a reference.
 *
 * The instance returned by {@link #tokenize(Reader, WVTDocumentInfo)} is this
 * object itself, so one instance serves a single document at a time.
 */
public class IKAnalyzerTokenizer implements WVTTokenizer, TokenEnumeration {

    /*
     * Original author's note: WVTool is usually given a SimpleTokenizer, which
     * (as far as the author could tell) only extracts chunks of text without
     * doing any real word segmentation. This class segments those chunks one
     * at a time; the actual segmentation happens in readNextTokenizer(), the
     * only private method of this class.
     */

    /** Lexeme texts that have been produced but not yet handed out. */
    private final List<String> currentToken;

    /** Token stream of the wrapped tokenizer; null until tokenize() succeeds. */
    private TokenEnumeration enumeration;

    /** Wrapped tokenizer that delivers the raw text chunks to segment. */
    private final WVTTokenizer tokenizer;

    /**
     * Creates a tokenizer that segments the output of the given tokenizer.
     *
     * @param tokenizer the tokenizer whose tokens are segmented further
     */
    public IKAnalyzerTokenizer(WVTTokenizer tokenizer) {
        this.tokenizer = tokenizer;
        currentToken = new ArrayList<String>();
        enumeration = null;
    }

    /**
     * Starts tokenizing a new document.
     *
     * @param source the text to tokenize; may be null
     * @param info information about the document, forwarded to the wrapped tokenizer
     * @return this object as the token stream, or null if {@code source} is null
     * @throws WVToolException if the wrapped tokenizer or the segmentation fails
     */
    @Override
    public TokenEnumeration tokenize(Reader source, WVTDocumentInfo info)
            throws WVToolException {
        if (source == null) {
            return null;
        }

        // Drop lexemes left over from a previous, partially consumed document.
        currentToken.clear();

        // Not recursion: this calls tokenize() on the tokenizer passed to the
        // constructor, not on this object.
        enumeration = tokenizer.tokenize(source, info);

        readNextTokenizer();
        return this;
    }

    /**
     * @return true if a document is being tokenized and at least one lexeme is
     *         buffered
     */
    @Override
    public boolean hasMoreTokens() {
        return enumeration != null && !currentToken.isEmpty();
    }

    /**
     * Returns the next lexeme text and refills the buffer when it runs empty.
     *
     * @return the next lexeme text, or null if none is left
     * @throws WVToolException if refilling the buffer fails
     */
    @Override
    public String nextToken() throws WVToolException {
        if (currentToken.isEmpty()) {
            return null;
        }

        String result = currentToken.remove(0);

        // Refill eagerly so that hasMoreTokens() stays accurate.
        if (currentToken.isEmpty()) {
            readNextTokenizer();
        }

        return result;
    }

    /**
     * Refills the buffer from the wrapped token stream. Keeps reading upstream
     * tokens until one of them yields at least one lexeme or the stream is
     * exhausted, so a chunk that segments to nothing does not end the
     * enumeration early.
     *
     * @throws WVToolException if the wrapped stream fails or the segmenter
     *         reports an I/O error
     */
    private void readNextTokenizer() throws WVToolException {
        if (enumeration == null) {
            return;
        }

        while (currentToken.isEmpty() && enumeration.hasMoreTokens()) {
            // One chunk of text delivered by the wrapped tokenizer.
            String text = enumeration.nextToken();
            if (text == null || text.length() == 0) {
                continue;
            }

            // The segmenter reads from a Reader, so wrap the chunk.
            IKSegmentation seg = new IKSegmentation(new StringReader(text));
            try {
                Lexeme lex;
                while ((lex = seg.next()) != null) {
                    currentToken.add(lex.getLexemeText());
                }
            } catch (IOException e) {
                // Surface the failure to the caller instead of printing the
                // stack trace and silently continuing with partial output.
                throw new WVToolException("IKAnalyzer segmentation failed", e);
            }
        }
    }

    /**
     * Small manual check: segments two identical lines and prints the lexemes
     * separated by '|'.
     */
    public static void main(String[] args) throws WVToolException {
        IKAnalyzerTokenizer toker = new IKAnalyzerTokenizer(new SimpleTokenizer());
        String string = "雅虎新闻雅虎新闻并校十年难言成败\n雅虎新闻雅虎新闻并校十年难言成败";

        StringReader reader = new StringReader(string);
        WVTDocumentInfo info = new WVTDocumentInfo("text.html", "html", "utf-8", "chinese");

        TokenEnumeration enumeration = toker.tokenize(reader, info);

        while (enumeration.hasMoreTokens()) {
            System.out.print(enumeration.nextToken() + "|");
        }
        // Output reported by the original author:
        // 雅虎|新闻|雅虎|新闻|并|校|十年|十|年|难言|成败|雅虎|新闻|雅虎|新闻|并|校|十年|十|年|难言|成败|
    }
}
 

 几乎一样啊,呵呵。

你可能感兴趣的:(tool)