Lucene自定义同义词分词器

 1 package com.lucene.util;

 2 

 3 import java.io.Reader;

 4 

 5 import org.apache.lucene.analysis.Analyzer;

 6 import org.apache.lucene.analysis.TokenStream;

 7 

 8 import com.chenlb.mmseg4j.Dictionary;

 9 import com.chenlb.mmseg4j.MaxWordSeg;

10 import com.chenlb.mmseg4j.analysis.MMSegTokenizer;

11 

12 public class MySameworkAnalyzer extends Analyzer {

13 

14     @Override

15     public TokenStream tokenStream(String str, Reader reader) {

16         //获取中文分词器的字段,我这里使用的是MMSeg4j的中文分词器

17         Dictionary dic=Dictionary.getInstance("F:\\官方包\\lucene-3.5.0\\mmseg4j-1.8.5\\data");

18         return new MySameworkFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));

19     }

20 

21 }
 1     @Test

 2     public void test05(){

 3         try {

 4             Analyzer a1=new MySameworkAnalyzer();

 5             String str="我来自中国,我的名字叫什么";

 6             AnalyzerUtil.displayToken(str, a1);

 7             Directory directory=new RAMDirectory();

 8             IndexWriter indexWriter=new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_35, a1));

 9             Document document=new Document();

10             document.add(new Field("content", str,Field.Store.YES,Field.Index.ANALYZED));

11             indexWriter.addDocument(document);

12             indexWriter.close();

13             IndexReader indexReader=IndexReader.open(directory);

14             IndexSearcher searcher=new IndexSearcher(indexReader);

15             TopDocs tds=searcher.search(new TermQuery(new Term("content", "大陆")), 10);

16             ScoreDoc[] docs=tds.scoreDocs;

17             Document doc=searcher.doc(docs[0].doc);

18             System.out.println(doc.get("content"));

19             searcher.close();

20             indexReader.close();

21         } catch (CorruptIndexException e) {

22             e.printStackTrace();

23         } catch (LockObtainFailedException e) {

24             e.printStackTrace();

25         } catch (IOException e) {

26             e.printStackTrace();

27         }

28     }

 

 1 package com.lucene.util;

 2 

 3 import java.io.IOException;

 4 import java.util.HashMap;

 5 import java.util.Map;

 6 import java.util.Stack;

 7 

 8 import org.apache.lucene.analysis.TokenFilter;

 9 import org.apache.lucene.analysis.TokenStream;

10 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

11 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

12 import org.apache.lucene.util.AttributeSource;

13 

14 public class MySameworkFilter extends TokenFilter  {

15 

16     //保存相应的词汇

17     private CharTermAttribute cta=null;

18     //保存词与词之间的位置增量

19     private PositionIncrementAttribute pia=null;

20     //定义一个状态

21     private AttributeSource.State current=null;

22     //用栈保存同义词集合

23     private Stack<String> sames=null;

24     protected MySameworkFilter(TokenStream input) {

25         super(input);

26         cta=this.addAttribute(CharTermAttribute.class);

27         pia=this.addAttribute(PositionIncrementAttribute.class);

28         sames=new Stack<String>();

29     }

30 

31 

32     @Override

33     public boolean incrementToken() throws IOException {

34         if(sames.size()>0){

35             //将元素出栈,并获取同义词

36             String str=sames.pop();

37             //还原状态

38             restoreState(current);

39             //先清空,再添加

40             cta.setEmpty();

41             cta.append(str);

42             //设置位置为0,表示同义词

43             pia.setPositionIncrement(0);

44             return true;

45         }

46         

47         if(!this.input.incrementToken())

48         return false;

49         

50         //如果改词中有同义词,捕获当前状态

51         if(this.getSamewords(cta.toString())){

52             current=captureState();

53         }

54         

55         return true;

56     }

57 

58     //定义同义词字典,并判断如果有同义词就返回true

59     private boolean getSamewords(String key){

60         Map<String, String[]> maps=new HashMap<String, String[]>();

61         maps.put("我", new String[]{"咱","俺"});

62         maps.put("中国", new String[]{"大陆","天朝"});

63         

64         if(maps.get(key)!=null){

65             for(String s:maps.get(key)){

66                 sames.push(s);

67             }

68         }

69         

70         if(sames.size()>0){

71             return true;

72         }

73         return false;

74     }

75 

76 }

 

你可能感兴趣的:(Lucene)