搜索引擎之猎兔分词实例

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

import com.lietu.seg.result.CnTokenizer;

/**
 * Demo of the Lietu Chinese word segmenter plugged into a Lucene analyzer.
 *
 * <p>Requires lucene-core-2.3.2.jar, the Lietu segmenter seg.jar and its
 * dictionary directory "dic".
 */
public class MyCnAnalyzerTest {

	/**
	 * Analyzer that segments text with {@link CnTokenizer} and then adds a
	 * single-character token for every character of each multi-character word
	 * (see {@link MySingleFilter}).
	 */
	static class MyCnAnalyzer extends Analyzer
	{

		public MyCnAnalyzer()
		{
			// makeTag is a static switch on CnTokenizer, so this affects every
			// tokenizer in the JVM, not only the ones created by this analyzer.
			// NOTE(review): presumably turns on tagging of the produced tokens -
			// confirm against the Lietu segmenter documentation.
			CnTokenizer.makeTag=true;
		}

		/**
		 * Builds the token stream for one field.
		 *
		 * @param fieldName name of the field being analyzed (not used here)
		 * @param reader    source of the text to segment
		 * @return the segmenter output wrapped in a {@link MySingleFilter}
		 */
		@Override
		public TokenStream tokenStream(String fieldName, Reader reader) {
			TokenStream result = new CnTokenizer(reader);
			result = new MySingleFilter(result);
			return result;
		}
	}

	/**
	 * Token filter that indexes every multi-character word both as the whole
	 * word and as its individual characters.
	 *
	 * <p>For each word longer than one character the filter first returns the
	 * word itself, unchanged, and then one token of type {@code "1word"} per
	 * character with a position increment of 0. The characters therefore share
	 * the word's position, so a search for either the word or one of its
	 * characters can match. Single-character tokens pass through untouched.
	 *
	 * <p>The word is deliberately returned <em>before</em> its characters: a
	 * position increment of 0 stacks a token on the token that precedes it, so
	 * returning the characters first would attach them to the previous word
	 * (and would give the very first token of the stream an increment of 0).
	 */
	static class MySingleFilter extends TokenFilter
	{
		/** Type label attached to every single-character token. */
		private static final String TOKEN_TYPE = "1word";

		/**
		 * Text of the word whose characters are still to be returned, or
		 * {@code null} when nothing is pending. The text is copied out of the
		 * token because the token itself has already been handed to the caller.
		 */
		private String pendingText = null;
		/** Start offset of the pending word in the original input. */
		private int pendingStart = 0;
		/** Index (in chars) of the next character of the pending word to return. */
		private int offset = 0;

		public MySingleFilter(TokenStream in) {
			super(in);
		}

		/**
		 * Returns the next token: a pending single character if a word is being
		 * expanded, otherwise the next token of the wrapped stream.
		 *
		 * @return the next token, or {@code null} at the end of the stream
		 * @throws IOException if the wrapped stream fails
		 */
		@Override
		public Token next() throws IOException {
			if (pendingText != null) {
				return nextSingle();
			}

			Token t = input.next();
			if (t == null) {
				return null;
			}

			String text = t.termText();
			// Count code points, not chars, so that one supplementary character
			// (a surrogate pair) is not treated as a two-character word.
			if (text.codePointCount(0, text.length()) > 1) {
				pendingText = text;
				pendingStart = t.startOffset();
				offset = 0;
			}
			// The word keeps its own position increment; its characters follow.
			return t;
		}

		/**
		 * Builds the token for the next character of the pending word and
		 * clears the pending state once the last character has been produced.
		 */
		private Token nextSingle() {
			// Advance by a whole code point so surrogate pairs are never split.
			int width = Character.charCount(pendingText.codePointAt(offset));
			int start = pendingStart + offset;
			Token single = new Token(pendingText.substring(offset, offset + width),
					start, start + width, TOKEN_TYPE);
			// Increment 0: same position as the whole word returned just before.
			single.setPositionIncrement(0);
			offset += width;
			if (offset >= pendingText.length()) {
				pendingText = null;
			}
			return single;
		}

	}

	/**
	 * Segments a sample sentence and prints, for every token, its text, start
	 * offset, end offset, type and position increment separated by spaces.
	 */
	public static void main(String[] args) throws IOException {
		MyCnAnalyzer cna = new MyCnAnalyzer();
		String input = "由广东省公安厅和广东卫视合办的《南粤警视》栏目,广西南宁,北部湾";
		// Note from the original author: calling tokenStream is the point at
		// which many of the segmenter's variables and classes get initialized
		// in memory.
		TokenStream ts = cna.tokenStream("asd", new StringReader(input));

		try {
			for (Token t = ts.next(); t != null; t = ts.next()) {
				System.out.println(t.termText() + " " + t.startOffset() + " "
						+ t.endOffset() + " " + t.type() + " "
						+ t.getPositionIncrement());
			}
		} finally {
			// Release the stream (and the reader it wraps) even if tokenizing fails.
			ts.close();
		}
	}
}

你可能感兴趣的:(搜索引擎,java)