Analysis: the process of converting text into the representation that goes into the index.
The text must be parsed first, then analyzed.
The only method an Analyzer has to implement:
public TokenStream tokenStream(String fieldName, Reader reader)
To improve performance, also implement reusableTokenStream.
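A minimal sketch of both methods (not Paoding code; assumes the Lucene 2.9/3.0-era API used in the Paoding excerpts below, and WhitespaceAnalyzerSketch is a made-up name):

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class WhitespaceAnalyzerSketch extends Analyzer {
    // the one required method: build a TokenStream over the field's text
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new WhitespaceTokenizer(reader);
    }

    // optional performance optimization: reuse the previously created Tokenizer
    // instead of allocating a new one for every document/field
    public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
        Tokenizer tokenizer = (Tokenizer) getPreviousTokenStream();
        if (tokenizer == null) {
            tokenizer = new WhitespaceTokenizer(reader);
            setPreviousTokenStream(tokenizer);
        } else {
            tokenizer.reset(reader); // same object, new input
        }
        return tokenizer;
    }
}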
Token: the analyzer's output; each token carries its offset and length.
Take Paoding's analyzer as an example:
public class PaodingAnalyzer extends PaodingAnalyzerBean { ... }

public class PaodingAnalyzerBean extends Analyzer {
    public TokenStream tokenStream(String fieldName, Reader reader) {
        if (knife == null) {
            throw new NullPointerException("knife should be set before token");
        }
        // PaodingTokenizer is the TokenStream implementation; it uses the knife
        // to dissect the text flowing in from the reader
        return new PaodingTokenizer(reader, knife, createTokenCollector());
    }
}

public final class CachingTokenFilter extends TokenFilter {
    public final boolean incrementToken() throws IOException {
        if (cache == null) {
            // fill cache lazily
            cache = new LinkedList<AttributeSource.State>();
            fillCache();
            iterator = cache.iterator();
        }
        if (!iterator.hasNext()) {
            // the cache is exhausted, return false
            return false;
        }
        // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
        restoreState(iterator.next());
        return true;
    }

    private void fillCache() throws IOException {
        while (input.incrementToken()) {
            cache.add(captureState());
        }
        // capture final state
        input.end();
        finalState = captureState();
    }
}

// incrementToken() is what gets called during analysis
public final class PaodingTokenizer extends Tokenizer implements Collector {
    public boolean incrementToken() throws IOException {
        // once the Token objects in tokenIteractor are exhausted, keep asking the reader for more input
        while (tokenIteractor == null || !tokenIteractor.hasNext()) {
            // (elided) read the next chunk from the Reader into beef and call
            // knife.dissect((Collector) this, beef, dissected) to refill tokenCollector;
            // return false once the input is exhausted
            tokenIteractor = tokenCollector.iterator();
        }
        Token token = tokenIteractor.next();
        termAtt.setTermBuffer(token.term());
        offsetAtt.setOffset(correctOffset(token.startOffset()), correctOffset(token.endOffset()));
        typeAtt.setType("paoding");
        return true;
    }
}
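A sketch of how a caller drives PaodingTokenizer.incrementToken and reads the term, offset and type attributes (the no-arg PaodingAnalyzer constructor, the net.paoding.analysis.analyzer import path and the sample text are assumptions, not from the notes):

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import net.paoding.analysis.analyzer.PaodingAnalyzer;

public class PaodingDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new PaodingAnalyzer();
        TokenStream ts = analyzer.tokenStream("content", new StringReader("中文分词示例"));
        TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
        // every incrementToken() call advances to the next token and refreshes the attributes
        while (ts.incrementToken()) {
            System.out.println(termAtt.term()
                    + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + ") "
                    + typeAtt.type());
        }
        ts.end();
        ts.close();
    }
}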
Call chain: CachingTokenFilter.incrementToken calls CachingTokenFilter.fillCache, which calls PaodingTokenizer.incrementToken, which in turn asks the knife to dissect the text. The full trace, starting from QueryParser.getFieldQuery:
getFieldQuery
    source = analyzer.tokenStream(field, new StringReader(queryText));
        new PaodingTokenizer(reader, knife, createTokenCollector());
    buffer = new CachingTokenFilter(source);
    hasMoreTokens = buffer.incrementToken();
        fillCache();
            input.incrementToken();
                knife.dissect((Collector) this, beef, dissected);
                    offset = super.dissect(collector, beef, offset);
                token = tokenIteractor.next();
                termAtt.setTermBuffer(token.term());
                offsetAtt.setOffset();
            captureState();
                this.getCurrentState();
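A sketch of the getFieldQuery pattern in the trace above: CachingTokenFilter lets the parser run the analyzer once and then replay the cached token states (Lucene 2.9-era API assumed; GetFieldQuerySketch and analyzeQueryText are made-up names, not Lucene methods):

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class GetFieldQuerySketch {
    // hypothetical helper mirroring the call chain above; returns the analyzed terms
    static List<String> analyzeQueryText(Analyzer analyzer, String field, String queryText)
            throws IOException {
        TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
        CachingTokenFilter buffer = new CachingTokenFilter(source);
        TermAttribute termAtt = buffer.addAttribute(TermAttribute.class);

        // first pass: the first incrementToken() triggers fillCache(), which drains
        // PaodingTokenizer once and stores a captured State per token
        int numTokens = 0;
        while (buffer.incrementToken()) {
            numTokens++;
        }

        // second pass: reset() rewinds the cache iterator; the cached states are
        // replayed through restoreState(), the tokenizer is not run again
        buffer.reset();
        List<String> terms = new ArrayList<String>(numTokens);
        while (buffer.incrementToken()) {
            terms.add(termAtt.term()); // QueryParser would build TermQuery/PhraseQuery here
        }
        source.close();
        return terms;
    }
}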