IKAnalyzer中文进行分词

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.*;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import org.wltea.analyzer.lucene.IKAnalyzer;
 
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
 
/**
 * 搜索工具类
 * User: Winter Lau
 * Date: 13-1-10
 * Time: 上午11:54
 */
public class SearchHelper {
 
    private final static Log log = LogFactory.getLog(SearchHelper.class);
    private final static IKAnalyzer analyzer = new IKAnalyzer();
 
    private final static List<String> nowords = new ArrayList<String>(){{//可用可不用
        try{
            addAll(IOUtils.readLines(SearchHelper.class.getResourceAsStream("/stopword.dic")));
        }catch(IOException e){
            log.error("Unabled to read stopword file", e);
        }
    }};
 
    /**
     * 关键字切分
     * @param sentence 要分词的句子
     * @return 返回分词结果
     */
    public static List<String> splitKeywords(String sentence) {
 
        List<String> keys = new ArrayList<String>();
 
        if(StringUtils.isNotBlank(sentence))  {
            StringReader reader = new StringReader(sentence);
            IKSegmenter ikseg = new IKSegmenter(reader, true);
            try{
                do{
                    Lexeme me = ikseg.next();
                    if(me == null)
                        break;
                    String term = me.getLexemeText();
                    if(StringUtils.isNumeric(StringUtils.remove(term,'.')))
                        continue;
                    if(nowords.contains(term.toLowerCase()))
                        continue;
                    keys.add(term);
                }while(true);
            }catch(IOException e){
                log.error("Unable to split keywords", e);
            }
        }
 
        return keys;
    }
 
}

 

protected static void test_split(){
    String text = "开源中国 www.oschina.net 成立于2008年8月,是目前中国最大的开源技术社区。我们传播开源的理念,推广开源项目,为 IT 开发者提供了一个发现、使用、并交流开源技术的平台。";
    long ct = System.currentTimeMillis();
    for(String word : SearchHelper.splitKeywords(text)){
        System.out.println(word);
    }
    System.out.printf("TIME %d\n", (System.currentTimeMillis() - ct));
}

 

你可能感兴趣的:(中文分词算法,IKAnalyzer)