lucene分词器分词

package com.essearch.core.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;


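// To write a custom Chinese analyzer, use the Chinese analyzers in the "cn" package of
// lucene-analyzers-common-4.10.2.jar as a starting point and adapt them to your requirements.
// For example, to tokenize a document character by character (here: into character n-grams),
// the core code looks like this: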

/**
 * Analyzer whose token stream is an {@link NGramTokenizer} (gram sizes
 * {@code MIN_GRAM}..{@code MAX_GRAM}) wrapped in a {@link LowerCaseFilter}.
 *
 * <p>{@link #main(String[])} is a small console demo that prints the tokens
 * produced by the bare tokenizer and then by the full analyzer chain.
 */
public class MyNGramAnalyzer extends Analyzer {

	/** Minimum gram size handed to {@link NGramTokenizer}. */
	private static final int MIN_GRAM = 1;

	/** Maximum gram size handed to {@link NGramTokenizer}. */
	private static final int MAX_GRAM = 15;

	/**
	 * Builds the analysis chain: n-gram tokenizer followed by a lower-case filter.
	 *
	 * @param fieldName name of the field being analyzed (not used by this analyzer)
	 * @param reader    source of the text to tokenize
	 * @return components whose source is the tokenizer and whose sink is the filter
	 */
	@Override
	protected TokenStreamComponents createComponents(String fieldName,
			Reader reader) {
		NGramTokenizer nGramTokenizer = new NGramTokenizer(reader, MIN_GRAM, MAX_GRAM);
		TokenStream result = new LowerCaseFilter(nGramTokenizer);
		return new TokenStreamComponents(nGramTokenizer, result);
	}

	/**
	 * Single-argument overload left over from an IDE-generated stub; it builds
	 * nothing and always returns {@code null}.
	 *
	 * <p>Deliberately NOT annotated with {@code @Override}: the rest of this class
	 * is written against the Analyzer API whose factory method also takes a
	 * {@link Reader}, and that API has no one-argument variant to override.
	 * NOTE(review): confirm against the Lucene version on the classpath.
	 *
	 * @param arg0 field name (ignored)
	 * @return always {@code null}
	 */
	protected TokenStreamComponents createComponents(String arg0) {
		return null;
	}

	/**
	 * Consumes the given tokenizer and prints one line per token (term text,
	 * offsets, type, position increment and position length) to standard output.
	 * The tokenizer is always closed, even when reading fails.
	 *
	 * @param tokenizer tokenizer to drain; closed by this method
	 */
	private static void testtokenizer(Tokenizer tokenizer) {
		try {
			// Attribute instances are stable for the lifetime of the stream, so look
			// them up once before consuming instead of once per token.
			CharTermAttribute charTermAttribute = tokenizer
					.addAttribute(CharTermAttribute.class);
			PositionIncrementAttribute positionIncrementAttribute = tokenizer
					.addAttribute(PositionIncrementAttribute.class);
			PositionLengthAttribute positionLengthAttribute = tokenizer
					.addAttribute(PositionLengthAttribute.class);
			OffsetAttribute offsetAttribute = tokenizer
					.addAttribute(OffsetAttribute.class);
			TypeAttribute typeAttribute = tokenizer
					.addAttribute(TypeAttribute.class);

			tokenizer.reset();
			while (tokenizer.incrementToken()) {
				System.out.println("term=" + charTermAttribute.toString() + ","
						+ offsetAttribute.startOffset() + "-"
						+ offsetAttribute.endOffset() + ",type="
						+ typeAttribute.type() + ",PositionIncrement="
						+ positionIncrementAttribute.getPositionIncrement()
						+ ",PositionLength="
						+ positionLengthAttribute.getPositionLength());
			}
			tokenizer.end();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Close on every path so the underlying reader is not leaked on failure.
			closeStream(tokenizer);
		}
	}

	/**
	 * Demo entry point: prints the tokens of a sample sentence, first from a bare
	 * {@link NGramTokenizer} and then from a {@link MyNGramAnalyzer}.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {

		String s = "编码规范从根本上解决了程序维护员的难题;规范的编码阅读和理解起来更容易,也可以快速的不费力气的借鉴别人的编码。对将来维护你编码的人来说,你的编码越优化,他们就越喜欢你的编码,理解起来也就越快。";
		StringReader sr = new StringReader(s);

		NGramTokenizer nGramTokenizer = new NGramTokenizer(sr, MIN_GRAM, MAX_GRAM);
		testtokenizer(nGramTokenizer);

		Analyzer analyzer = new MyNGramAnalyzer();
		try {
			testAnalyzer(analyzer, s);
		} finally {
			// Analyzers hold reusable per-thread components; release them when done.
			analyzer.close();
		}
	}

	/**
	 * Runs {@code data} through {@code analyzer} and prints one line per token in
	 * the form {@code start-end:term|type} to standard output.
	 *
	 * @param analyzer analyzer to exercise; not closed by this method
	 * @param data     text to analyze
	 */
	private static void testAnalyzer(Analyzer analyzer, String data) {

		TokenStream ts = null;
		try {
			ts = analyzer.tokenStream("myfield", new StringReader(data));
			// Token offset attribute
			OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
			// Token text attribute
			CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
			// Token type attribute
			TypeAttribute type = ts.addAttribute(TypeAttribute.class);
			ts.reset();
			// Iterate over the analysis result
			while (ts.incrementToken()) {
				System.out.println(offset.startOffset() + "-" + offset.endOffset() + ":" + term.toString() + "|" + type.type());
			}
			// Signal end-of-stream; the stream itself is closed in the finally block
			ts.end();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Release all resources held by the TokenStream
			closeStream(ts);
		}

	}

	/**
	 * Closes a token stream, reporting (not propagating) any I/O failure.
	 *
	 * @param stream stream to close; {@code null} is ignored
	 */
	private static void closeStream(TokenStream stream) {
		if (stream == null) {
			return;
		}
		try {
			stream.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

你可能感兴趣的:(lucene分词器分词)