Repost from: http://blog.sina.com.cn/s/blog_4b3b7aff0100g3wh.html
Lucene's tokenizer component is the part people most often replace. Our lab's own segmenter is better suited to natural language processing, so how do we plug our own segmentation results into Lucene? In Lucene 3 this changed considerably. After some digging, it turns out you only need to override Tokenizer; the key is the incrementToken() method, which writes the term text and offset data into the index. The FMM.fmm() call below goes through JNI into a dll; the dll is written in C, segments very efficiently, and has strong recognition of out-of-vocabulary words.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public class ICTSegTokenizer extends Tokenizer {

    public ICTSegTokenizer(Reader in) {
        super(in);
        init();
    }

    public ICTSegTokenizer(AttributeSource source, Reader in) {
        super(source, in);
        init();
    }

    public ICTSegTokenizer(AttributeFactory factory, Reader in) {
        super(factory, in);
        init();
    }

    private void init() {
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    }

    // Read the whole input, pass it to the native segmenter, and split
    // the tab-separated result into an array of words. Note that
    // readLine() drops line breaks, so offsets can drift on multi-line input.
    public void seg() {
        try {
            BufferedReader buffer = new BufferedReader(input); // buffer the input
            StringBuffer sb = new StringBuffer();              // accumulates the full text
            String line;
            while ((line = buffer.readLine()) != null) {
                sb.append(line);
            }
            String result = FMM.fmm(sb.toString(), "\t"); // JNI call into the C dll
            FMM.freeMem();
            resultArray = result.split("\t");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    private int index = 0, len = 0, offset = 0;
    private String[] resultArray;
    private TermAttribute termAtt;
    private OffsetAttribute offsetAtt;

    public boolean incrementToken() {
        clearAttributes();
        if (index == 0 && len == 0) { // segment lazily on the first call
            seg();
            len = resultArray.length;
        }
        if (index > len - 1) { // past the last word: end of stream
            index = 0;
            len = 0;
            return false;
        }
        String word = resultArray[index];              // current word
        termAtt.setTermBuffer(word);                   // write the term text
        int wordLen = word.length();
        offsetAtt.setOffset(offset, offset + wordLen); // write the start/end offsets
        offset += wordLen;
        index++;
        return true;
    }

    public final void end() {
        // set the final offset
        final int finalOffset = offset;
        this.offsetAtt.setOffset(finalOffset, finalOffset);
    }

    public void reset() throws IOException {
        super.reset();
        offset = index = 0;
    }

    public void reset(Reader input) throws IOException {
        super.reset(input);
        reset();
    }
}
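Once the Tokenizer compiles it can be exercised directly, without going through an Analyzer. The sketch below is mine, not from the original post: the class name and sample text are hypothetical, and it assumes the FMM JNI library and its dll can actually be loaded at runtime. It feeds a string to ICTSegTokenizer and prints each term with its offsets:

import java.io.StringReader;

import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ICTSegTokenizerDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical sample text; any Chinese sentence works.
        ICTSegTokenizer tokenizer = new ICTSegTokenizer(new StringReader("这是一个测试"));
        TermAttribute term = tokenizer.getAttribute(TermAttribute.class);
        OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
        while (tokenizer.incrementToken()) {
            System.out.println(term.term() + " (" + offset.startOffset()
                    + "-" + offset.endOffset() + ")");
        }
        tokenizer.end();   // records the final offset
        tokenizer.close(); // closes the underlying Reader
    }
}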
Lucene allows the tokenizer to be extended; in other words, it lets an analyzer you wrote yourself be plugged into Lucene. How does Lucene make this possible? If we had to design it ourselves, how would we do it? Lucene's built-in StandardAnalyzer is used below to illustrate.
First, look at the StandardAnalyzer code. For brevity, and to highlight the essentials, not all of StandardAnalyzer's code is shown:
public class StandardAnalyzer extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopSet);
        return result;
    }
}
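So StandardAnalyzer simply chains TokenFilters around a Tokenizer. The same decorator pattern wires the custom tokenizer above into Lucene. A minimal sketch under that assumption (ICTSegAnalyzer is a hypothetical name, and the LowerCaseFilter step only matters for any Latin text mixed into the Chinese):

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;

public class ICTSegAnalyzer extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new ICTSegTokenizer(reader); // custom segmentation first
        result = new LowerCaseFilter(result);             // then normalize case
        return result;
    }
}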
public abstract class Analyzer {
    public abstract TokenStream tokenStream(String fieldName, Reader reader);

    public int getPositionIncrementGap(String fieldName) {
        return 0;
    }
}
String s = "hello";
Reader reader = new StringReader(s); // StringReader extends Reader
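With the Reader in hand, the analyzer produces a TokenStream that is consumed through the same incrementToken() protocol shown earlier. A sketch, assuming Lucene 3.0, where the StandardAnalyzer constructor takes a org.apache.lucene.util.Version (unlike the older no-argument form shown above):

Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
TokenStream ts = analyzer.tokenStream("content", reader);
TermAttribute term = ts.getAttribute(TermAttribute.class);
while (ts.incrementToken()) {
    System.out.println(term.term()); // prints: hello
}
ts.end();
ts.close();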