ES小知识:使用 IKSegmenter 实现自定义分词器

代码记录:

private static final Pattern SPECIAL_CHAR_PATTERN = Pattern.compile(".*[`~!@#$%^&*()+=|{}':;',\\[\\].<>/?~!@#¥%……&*()——+|{}【】\\-‘;:”“’。,、?\\\\]+.*");
/**
 * 自定义分词器
 */
private String customIk(String text) {
    StringBuilder word2Frequency = new StringBuilder();
    // 使用 IKSegmenter 初始化文本信息并加载词典
    IKSegmenter ikSegmenter = new IKSegmenter(new StringReader(text), true);
    Lexeme lex;
    while (true) {
        try {
            if ((lex = ikSegmenter.next()) == null) {
                break;
            }
            String word = StringUtils.trimToEmpty(lex.getLexemeText());
            if (StringUtils.isBlank(word)) {
                continue;
            } // 过滤一些高频率的符号
            else if (word.length() < 3 && SPECIAL_CHAR_PATTERN.matcher(word).matches()) {
                continue;
            }
            // 此处过滤长度为1&&不是数字的str,可以根据自己需求定义
            else if (word.length() < 2 && !Character.isDigit(word.charAt(0))) {
                continue;
            }
            word2Frequency.append(word).append(",");
        } catch (IOException e) {
            log.error("customIk.自定义分词异常", e);
        }
    }
    if(word2Frequency.length() == 0){
        word2Frequency.append(text);
    }
    log.info("customIk.自定义分词结果={}", word2Frequency);
    return word2Frequency.toString();
}

你可能感兴趣的:(elasticsearch,大数据,java)