Lucene 自定义分词器

 1 package com.lucene.util;

 2 

 3 import java.io.Reader;

 4 import java.util.Set;

 5 

 6 import org.apache.lucene.analysis.Analyzer;

 7 import org.apache.lucene.analysis.LetterTokenizer;

 8 import org.apache.lucene.analysis.LowerCaseFilter;

 9 import org.apache.lucene.analysis.StopAnalyzer;

10 import org.apache.lucene.analysis.StopFilter;

11 import org.apache.lucene.analysis.TokenStream;

12 import org.apache.lucene.util.Version;

13 

14 //定义禁用词分词器

15 public class UserDefinedAnalyzer extends Analyzer {

16 

17     //定义禁用词集合

18     private Set stops;

19     

20     //无参构造器使用默认的禁用词分词器

21     public UserDefinedAnalyzer(){

22         stops=StopAnalyzer.ENGLISH_STOP_WORDS_SET;

23     }

24     

25     /**

26      * 传一个禁用词数组

27      * @param sws

28      */

29     public UserDefinedAnalyzer(String[] sws){

30         //使用stopFilter创建禁用词集合

31         stops=StopFilter.makeStopSet(Version.LUCENE_35,sws,true);

32         //将默认的禁用词添加进集合

33         stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);

34     }

35     

36     /**

37      * 自定义分词器

38      */

39     @Override

40     public TokenStream tokenStream(String str, Reader reader) {

41         

42         return new StopFilter    

43                 (Version.LUCENE_35, 

44                         new LowerCaseFilter    

45                 (Version.LUCENE_35, 

46                         new LetterTokenizer(

47                 Version.LUCENE_35, reader)), stops);

48     }

49 

50 }
    /**
     * Runs a sample sentence through a {@code UserDefinedAnalyzer} configured
     * with the extra stop words "my" and "name", and hands the sentence and
     * analyzer to {@code AnalyzerUtil.displayToken} (presumably to print the
     * surviving tokens; that helper is defined elsewhere).
     */
    @Test
    public void test04() {
        final String[] extraStopWords = {"my", "name"};
        final Analyzer analyzer = new UserDefinedAnalyzer(extraStopWords);
        final String sentence = "my name is paul";

        AnalyzerUtil.displayToken(sentence, analyzer);
    }

 

你可能感兴趣的:(Lucene)