lucene5.5根据现有分词器改造做同义词分词器

Lucene 5 之后的版本改动较大,这里把在 Lucene 5.5 上改造同义词分词器的代码和做法记录一下。

功能类加测试类一共 6 个,下面逐一介绍:

1 同义词分词器类SameWordAnalyzer

2 同义词过滤器类SameWordFilter

3 根据词语获取同义词引擎接口SameWordEngine

4 同义词引擎接口实现类SameWordEngineImpl

5 分词器分词分析工具类 AnalyzerUtils

6 结果测试类 TestUnit

使用的jar包如下

lucene5.5根据现有分词器改造做同义词分词器_第1张图片



具体代码如下

1 SameWordAnalyzer类内容:

package com.liu.lucene.pro;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Analyzer that splits text with a {@link StandardTokenizer} and then runs the
 * tokens through a {@link SameWordFilter} backed by the supplied engine.
 */
public class SameWordAnalyzer extends Analyzer {

    /** Synonym lookup handed to every filter this analyzer creates. */
    private final SameWordEngine engine;

    /**
     * @param engine synonym source passed on to each {@link SameWordFilter}
     */
    public SameWordAnalyzer(SameWordEngine engine) {
        this.engine = engine;
    }

    /**
     * Builds the tokenizer + synonym-filter chain. The field name is not used;
     * the same chain is created for every field.
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final Tokenizer tokenizer = new StandardTokenizer();
        return new TokenStreamComponents(tokenizer, new SameWordFilter(tokenizer, engine));
    }
}

2 SameWordFilter类


package com.liu.lucene.pro;

import java.io.IOException;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;


public class SameWordFilter extends TokenFilter {
private SameWordEngine engine;
private Stack samewordStack;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    private AttributeSource.State current;
    
    protected SameWordFilter(TokenStream input,SameWordEngine engine) {
super(input);
this.engine = engine;
samewordStack = new Stack();
}
    
@Override
public boolean incrementToken() throws IOException {
if(samewordStack.size()>0){
String sameWord = samewordStack.pop();
this.restoreState(current);
//termAtt.copyBuffer(sameWord.toCharArray(), 0, sameWord.length());
termAtt.setEmpty();
termAtt.append(sameWord);
posIncrAtt.setPositionIncrement(0);
return true;
}

if(!input.incrementToken()){
return false;
}

if(isAddSameWord()){
current = this.captureState();
}

return true;
}


private boolean isAddSameWord() {
String[] sameWords = engine.getSameWords(termAtt.toString());
if(sameWords == null){
return false;
}

for(String sameWord:sameWords){
samewordStack.push(sameWord);
}
return true;
}


}


3 SameWordEngine引擎接口

package com.liu.lucene.pro;


/**
 * Lookup of the synonyms ("same words") of a single term.
 */
@FunctionalInterface
public interface SameWordEngine {

    /**
     * Returns the synonyms of the given term.
     *
     * @param str the term to look up
     * @return the synonyms of {@code str}; implementations in this project
     *         return {@code null} when the term has none
     */
    String[] getSameWords(String str);
}


4 SameWordEngineImpl引擎接口实现类

package com.liu.lucene.pro;


import java.util.HashMap;
import java.util.Map;


/**
 * {@link SameWordEngine} backed by a small hard-coded synonym table.
 */
public class SameWordEngineImpl implements SameWordEngine {

    /** Synonym table, built once at class load instead of on every lookup. */
    private static final Map<String, String[]> SAME_WORDS = new HashMap<>();

    static {
        SAME_WORDS.put("2015", new String[] {"二零一五", "20一5"});
        SAME_WORDS.put("redis", new String[] {"内存数据库", "re内存"});
    }

    /**
     * Returns the synonyms registered for the given term.
     *
     * @param str the term to look up; matched exactly against the table keys
     * @return a fresh array holding the synonyms of {@code str}, or {@code null}
     *         if the term is not in the table
     */
    @Override
    public String[] getSameWords(String str) {
        String[] sameWords = SAME_WORDS.get(str);
        // Hand out a copy so callers cannot modify the shared table entries.
        return sameWords == null ? null : sameWords.clone();
    }
}

5 分词器分析工具类AnalyzerUtils


package com.liu.lucene.pro;


import java.io.IOException;
import java.io.Reader;


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;


/**
 * Helpers for inspecting what an {@link Analyzer} produces.
 */
public class AnalyzerUtils {

    /**
     * Analyzes the reader's content and prints every resulting term to standard
     * output as {@code [term]}, with no separator between terms.
     *
     * <p>The token stream is driven through reset / incrementToken / end and is
     * always closed, so the analyzer can be used again afterwards. Closing the
     * stream is expected to close {@code reader} as well (per Lucene's
     * {@code Tokenizer.close()} contract).
     *
     * <p>An {@link IOException} during analysis is not propagated; its stack
     * trace is printed instead.
     *
     * @param analyzer analyzer to run; the field name {@code "path"} is used
     * @param reader   source of the text to analyze
     */
    public static void displayTokens(Analyzer analyzer, Reader reader) {
        try (TokenStream tokenStream = analyzer.tokenStream("path", reader)) {
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);

            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                System.out.print("[" + term.toString() + "]");
            }
            // Signals end-of-stream to the chain before try-with-resources closes it.
            tokenStream.end();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}


6 测试类 TestUnit

package com.liu.lucene.test;


import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.Reader;
import org.junit.Before;
import org.junit.Test;


import com.liu.lucene.pro.AnalyzerUtils;
import com.liu.lucene.pro.LuceneIndex;
import com.liu.lucene.pro.SameWordAnalyzer;
import com.liu.lucene.pro.SameWordEngineImpl;


/**
 * Manual smoke tests for indexing, searching and token display with
 * {@link SameWordAnalyzer}. None of the tests assert on results; they only
 * check that the calls complete without throwing.
 */
public class TestUnit {
    LuceneIndex index;

    /** Creates a fresh {@link LuceneIndex} before each test. */
    @Before
    public void setUp() {
        index = new LuceneIndex();
    }

    /** Runs {@code index(true)} without passing an analyzer. */
    @Test
    public void testIndex() {
        index.index(true);
    }

    /** Runs {@code index(true, ...)} with the synonym analyzer. */
    @Test
    public void testIndexAnalyzer() {
        index.index(true, new SameWordAnalyzer(new SameWordEngineImpl()));
    }

    /** Searches for a synonym of "2015" using the synonym analyzer. */
    @Test
    public void testSearch() {
        index.search("20一5", new SameWordAnalyzer(new SameWordEngineImpl()));
    }

    /**
     * Prints the tokens of a local log file.
     *
     * <p>The path is machine-specific. A missing or unreadable file now fails
     * the test instead of being swallowed, and the reader is always closed.
     *
     * @throws Exception if the file cannot be opened or closed
     */
    @Test
    public void testDisplayTokens() throws Exception {
        try (Reader reader = new FileReader("D:\\lhl\\developSoft\\apache-tomcat-7.0.62-windows-x64\\apache-tomcat-7.0.62\\logs\\loginfo.log.2015-11-27.log")) {
            AnalyzerUtils.displayTokens(new SameWordAnalyzer(new SameWordEngineImpl()), reader);
        }
    }
}

你可能感兴趣的:(lucene5.5根据现有分词器改造做同义词分词器)