Elasticsearch 1.1.0 对应的 mmseg 插件版本是 1.2.2,该版本没有解决英文与数字混合时的分词问题。
比如 user123,分词后仍然是 user123,字母和数字没有被切分开。
解决方案1:
将 mmseg 插件升级到 elasticsearch-analysis-mmseg-1.4.0,并参考下面的提交和配置。
https://github.com/medcl/elasticsearch-analysis-mmseg/commit/61b5e8199425c845a3060fe39f40e59868dd364b
# Analysis settings for the mmseg plugin.
index:
  analysis:
    # Two tokenizers of type mmseg, differing only in seg_type.
    tokenizer:
      mmseg_maxword:
        type: mmseg
        seg_type: max_word
      mmseg_complex:
        type: mmseg
        seg_type: complex
    analyzer:
      # NOTE(review): cut_letter_digit is presumably the filter that splits
      # mixed letter/digit tokens such as "user123" - confirm against the plugin.
      mmseg_maxword:
        type: custom
        filter:
          - lowercase
          - cut_letter_digit
        tokenizer: mmseg_maxword
      # "mmseg" is configured identically to "mmseg_maxword".
      mmseg:
        type: custom
        filter:
          - lowercase
          - cut_letter_digit
        tokenizer: mmseg_maxword
      # The complex analyzer only lowercases; it does not use cut_letter_digit.
      mmseg_complex:
        type: custom
        filter:
          - lowercase
        tokenizer: mmseg_complex

# Default analyzer: alternatives kept for reference, "mmseg" is the active one.
#index.analysis.analyzer.default.type : "org.elasticsearch.index.analysis.MMsegAnalyzerProvider"
#index.analysis.analyzer.default.type : "ik"
index.analysis.analyzer.default.type : "mmseg"
解决方案2:
修改 1.2.2 版本 jar 包中的 MMSegAnalyzer,重新编译后替换 jar 包中对应的 class 文件:
package com.chenlb.mmseg4j.analysis;

import java.io.File;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;

import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.Seg;

/**
 * Analyzer that uses max-word segmentation by default.
 *
 * @see SimpleAnalyzer
 * @see ComplexAnalyzer
 * @see MaxWordAnalyzer
 *
 * @author chenlb
 */
public class MMSegAnalyzer extends Analyzer {

    /** Dictionary handed to every {@link Seg} created by {@link #newSeg()}. */
    protected Dictionary dic;

    /**
     * Creates an analyzer backed by the default dictionary instance.
     *
     * @see Dictionary#getInstance()
     */
    public MMSegAnalyzer() {
        this.dic = Dictionary.getInstance();
    }

    /**
     * Creates an analyzer backed by the dictionary found at the given path.
     *
     * @param path dictionary path
     * @see Dictionary#getInstance(String)
     */
    public MMSegAnalyzer(String path) {
        this.dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by the dictionary found in the given directory.
     *
     * @param path dictionary directory
     * @see Dictionary#getInstance(File)
     */
    public MMSegAnalyzer(File path) {
        this.dic = Dictionary.getInstance(path);
    }

    /**
     * Creates an analyzer backed by an already constructed dictionary.
     *
     * @param dic dictionary to use
     */
    public MMSegAnalyzer(Dictionary dic) {
        this.dic = dic;
    }

    /**
     * Creates the segmenter used for a new tokenizer.
     *
     * @return a {@link MaxWordSeg} built on this analyzer's dictionary
     */
    protected Seg newSeg() {
        return new MaxWordSeg(dic);
    }

    /**
     * @return the dictionary this analyzer was created with
     */
    public Dictionary getDict() {
        return dic;
    }

    /**
     * Builds the token stream: an {@link MMSegTokenizer} whose output is passed
     * through a {@link CutLetterDigitFilter}.
     *
     * <p>The earlier version of this method returned the tokenizer on its own,
     * i.e. {@code new TokenStreamComponents(new MMSegTokenizer(newSeg(), reader))};
     * the filter stage is the modification.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return components made of the tokenizer and the filter wrapping it
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        final Tokenizer source = new MMSegTokenizer(newSeg(), reader);
        final CutLetterDigitFilter filtered = new CutLetterDigitFilter(source);
        return new TokenStreamComponents(source, filtered);
    }
}