Lucene 4.x 自定义停用词分词器

package com.kkrgwbj.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

/**
 * Custom stop-word analyzer: splits text with a {@link LetterTokenizer},
 * lower-cases every token, and then drops tokens found in the stop-word set.
 * The stop-word set is always the built-in English stop words plus any
 * caller-supplied words.
 *
 * Created by lvbingyang on 2015/11/25 0025.
 */
public class MyStopAnalyzer extends Analyzer {
    /** Case-insensitive, read-only stop-word set, built once and shared by all token streams. */
    private final CharArraySet stops;

    /**
     * Creates an analyzer that filters the given words in addition to the
     * default English stop words.
     *
     * @param sws extra stop words; {@code null} is treated as "no extra words"
     */
    public MyStopAnalyzer(String[] sws) {
        String[] extraWords = (sws == null) ? new String[0] : sws;
        // Build a case-insensitive set from the caller's words...
        CharArraySet merged = StopFilter.makeStopSet(Version.LUCENE_45, extraWords, true);
        // ...and add the default English stop words on top of it.
        merged.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        stops = CharArraySet.unmodifiableSet(merged);
    }

    /**
     * Default constructor: filters only the default English stop words.
     */
    public MyStopAnalyzer() {
        this(new String[0]);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // The tokenizer consumes the reader and produces the raw token stream.
        Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_45, reader);
        // Lower-case BEFORE stop filtering so that capitalized tokens such as
        // "The" or "I" are compared against the stop set in normalized form.
        TokenStream lowerCased = new LowerCaseFilter(Version.LUCENE_45, tokenizer);
        TokenStream tokenStream = new StopFilter(Version.LUCENE_45, lowerCased, stops);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}

JUnit 测试:

 @Test
    public void test2() {
        // Analyzer under test: custom stop words on top of the defaults.
        Analyzer customAnalyzer = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
        // Baseline: Lucene's stock stop-word analyzer.
        Analyzer defaultAnalyzer = new StopAnalyzer(Version.LUCENE_45);

        // The same sentence goes through both analyzers so the output can be compared.
        String sample = "i love you,i hate you";

        // Tokens produced by the custom stop-word analyzer.
        AnalyzerUtils.displayToken(sample, customAnalyzer);
        // Tokens produced by the default stop-word analyzer.
        AnalyzerUtils.displayToken(sample, defaultAnalyzer);
    }



在这里,我们在默认英文停用词的基础上,额外停用了 I、you、hate 三个词,运行结果:

lucene4.x自定义停用分词器_第1张图片

 

 

 

 

你可能感兴趣的:(Lucene,分词器,停用分词器)