以下搜索提示(suggest)方案配合使用了 ik 分词器和 pinyin 插件
https://github.com/medcl/elasticsearch-analysis-ik/releases
https://github.com/medcl/elasticsearch-analysis-pinyin/releases
检验ik分词器和拼音插件是否生效
POST /_analyze
{
"analyzer":"pinyin",
"text":"北京东"
}
POST /_analyze
{
"analyzer":"ik_max_word",
"text":"北京东"
}
拼音的分析结果
{
"tokens": [
{
"token": "bei",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 0
},
{
"token": "jing",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 1
},
{
"token": "dong",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 2
},
{
"token": "bjd",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 2
}
]
}
--------
IK分词分析结果
{
"tokens": [
{
"token": "北京",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "京东",
"start_offset": 1,
"end_offset": 3,
"type": "CN_WORD",
"position": 1
}
]
}
建立索引
{
"index": {
"analysis": {
"analyzer": {
"pinyin_analyzer": {
"tokenizer": "s-pinyin"
},
"first_py_letter_analyzer": {
"tokenizer": "first_py_letter"
},
"full_pinyin_letter_analyzer": {
"tokenizer": "full_pinyin_letter"
}
},
"tokenizer": {
"s-pinyin": {
"keep_joined_full_pinyin": "true",
"keep_first_letter": "true",
"keep_separate_first_letter": "false",
"lowercase": "true",
"type": "pinyin",
"limit_first_letter_length": "16",
"keep_original": "true",
"keep_full_pinyin": "true",
"keep_none_chinese_in_joined_full_pinyin": "true"
},
"first_py_letter": {
"type": "pinyin",
"keep_first_letter": true,
"keep_full_pinyin": false,
"keep_original": false,
"limit_first_letter_length": 16,
"lowercase": true,
"trim_whitespace": true,
"keep_none_chinese_in_first_letter": false,
"none_chinese_pinyin_tokenize": false,
"keep_none_chinese": true,
"keep_none_chinese_in_joined_full_pinyin": true
},
"full_pinyin_letter": {
"type": "pinyin",
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_original": false,
"limit_first_letter_length": 16,
"lowercase": true,
"keep_first_letter": false,
"keep_none_chinese_in_first_letter": false,
"none_chinese_pinyin_tokenize": false,
"keep_none_chinese": true,
"keep_joined_full_pinyin": true,
"keep_none_chinese_in_joined_full_pinyin": true
}
}
}
}
}
建立mapping
{
"suggest-word": {
"properties": {
"suggest": {
"type": "completion",
"fields": {
"s-pinyin": {
"type": "completion",
"analyzer": "pinyin_analyzer"
},
"keyword-pinyin": {
"type": "completion",
"analyzer": "full_pinyin_letter_analyzer"
},
"keyword-first-py": {
"type": "completion",
"analyzer": "first_py_letter_analyzer"
},
"ik-word":{
"type": "completion",
"analyzer": "ik_max_word"
},
"standard-word":{
"type": "completion",
"analyzer": "standard"
}
}
}
}
}
}
查询suggest
{
"suggest": {
"text": "美白",
"keyword_pinyin": {
"completion": {
"field": "suggest.keyword-pinyin"
}
},
"s-pinyin": {
"completion": {
"field": "suggest.s-pinyin"
}
},
"standard-word": {
"completion": {
"field": "suggest.standard-word"
}
},
"keyword_first_py": {
"completion": {
"field": "suggest.keyword-first-py"
}
},
"ik-word": {
"completion": {
"field": "suggest.ik-word"
}
}
}
}
上面的查询会返回 5 组 suggest 结果,实际使用时不应全部采用,而应根据输入情况按以下优先级选取:
ik-word->s-pinyin->keyword_pinyin->keyword_first_py->standard-word
与输入偏差越大的结果应当排在越后,仅在前面的结果不足时用于补全等操作。
关于词库
词库应该是库内的专业词库,或者从搜索日志里捞出搜索量很大的词汇,充当搜索词建议。
Java API
/**
 * Returns suggested words for {@code text} from the {@code cb_es_ext_word} index.
 *
 * <p>Six completion suggestions are sent in one search request, one per
 * completion field (s-pinyin, standard, full pinyin, ik, first-letter pinyin,
 * and a fuzzy match on the plain {@code suggest} field). The results are then
 * merged in priority order:
 * <ol>
 *   <li>{@code ik-word} is always merged first;</li>
 *   <li>{@code s-pinyin} and {@code keyword_pinyin} are merged when fewer than
 *       10 words have been collected;</li>
 *   <li>{@code standard-word}, {@code keyword-first-py} and {@code suggest-fuzzy}
 *       are each tried, in that order, only while nothing has been collected.</li>
 * </ol>
 *
 * <p>Author's note on intended usage: all-Chinese input is served by ik-word and
 * standard; mixed Chinese/English input by s-pinyin and keyword_pinyin; all-English
 * input by the pinyin fields.
 *
 * @param text the prefix typed by the user
 * @return distinct suggested words in merge order; an empty list when
 *         {@code text} is null or empty, when the search fails, or when the
 *         response carries no suggest section
 */
public List<String> suggestWord(String text) {
    List<String> suggestList = new ArrayList<>();
    if (text == null || text.isEmpty()) {
        return suggestList;
    }

    String indexName = "cb_es_ext_word";
    CompletionSuggestionBuilder sPinyin = SuggestBuilders.completionSuggestion("suggest_spinyin").prefix(text);
    CompletionSuggestionBuilder standardWord = SuggestBuilders.completionSuggestion("suggest_standard").prefix(text);
    CompletionSuggestionBuilder keywordPinyin = SuggestBuilders.completionSuggestion("suggest_pinyin").prefix(text);
    CompletionSuggestionBuilder ikWord = SuggestBuilders.completionSuggestion("suggest_ik_word").prefix(text);
    CompletionSuggestionBuilder keywordFirstPy = SuggestBuilders.completionSuggestion("suggest_first_py").prefix(text);
    CompletionSuggestionBuilder suggestFuzzy = SuggestBuilders.completionSuggestion("suggest").prefix(text, Fuzziness.TWO);

    SuggestBuilder suggestBuilder = new SuggestBuilder()
            .addSuggestion("s-pinyin", sPinyin)
            .addSuggestion("standard-word", standardWord)
            .addSuggestion("keyword_pinyin", keywordPinyin)
            .addSuggestion("ik-word", ikWord)
            .addSuggestion("keyword-first-py", keywordFirstPy)
            .addSuggestion("suggest-fuzzy", suggestFuzzy);
    SearchRequest searchRequest = new SearchRequest()
            .indices(indexName)
            .types(ElasticSearchConstant.DEFAULT_TYPE_STR)
            .source(new SearchSourceBuilder().suggest(suggestBuilder));

    SearchResponse searchResponse;
    try {
        LOGGER.debug(" SearchRequest String:" + searchRequest.source().toString());
        searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    } catch (IOException e) {
        // Previously the exception was only printed and the null response was
        // dereferenced below; suggestions are best-effort, so degrade to "none".
        LOGGER.error("Suggest search failed, index=" + indexName + ", text=" + text, e);
        return suggestList;
    }

    Suggest suggestions = searchResponse.getSuggest();
    if (suggestions == null) {
        return suggestList;
    }

    // Merge the suggestion groups; LinkedHashMap keeps first-seen (priority) order.
    Map<String, Integer> suggestMap = new LinkedHashMap<>();
    handlerSuggest(suggestions, suggestMap, "ik-word");
    if (suggestMap.size() < 10) {
        handlerSuggest(suggestions, suggestMap, "s-pinyin");
        handlerSuggest(suggestions, suggestMap, "keyword_pinyin");
    }
    if (suggestMap.isEmpty()) {
        handlerSuggest(suggestions, suggestMap, "standard-word");
    }
    if (suggestMap.isEmpty()) {
        handlerSuggest(suggestions, suggestMap, "keyword-first-py");
    }
    if (suggestMap.isEmpty()) {
        // Last resort: fuzzy match against the dictionary to correct the input text.
        handlerSuggest(suggestions, suggestMap, "suggest-fuzzy");
    }
    LOGGER.debug(" Suggest result:" + JSON.toJSONString(suggestMap));

    suggestList.addAll(suggestMap.keySet());
    return suggestList;
}
private void handlerSuggest(Suggest suggestions,Map suggestMap,String suggestName){
List extends Suggest.Suggestion.Entry extends Suggest.Suggestion.Entry.Option>> results = suggestions.getSuggestion(suggestName).getEntries();
for (Suggest.Suggestion.Entry extends Suggest.Suggestion.Entry.Option> op : results) {
List extends Suggest.Suggestion.Entry.Option> options = op.getOptions();
for (Suggest.Suggestion.Entry.Option pp : options) {
if (suggestMap.containsKey(pp.getText().toString())) {
suggestMap.put(pp.getText().toString(), suggestMap.get(pp.getText().toString()) + 1);
} else {
suggestMap.put(pp.getText().toString(), 1);
}
}
}
}
使用weight
数据结构
{
"suggest":{
"input":"联想词",
"weight":10
}
}
使用 weight 时,不能通过 fields(multi-fields)子字段的方式做映射,
需要建立多个独立的 completion 字段,分别指定不同的分词器:
{
"mappings": {
"content_bank_entity": {
"properties": {
"suggest_spinyin": {
"max_input_length": 50,
"analyzer": "pinyin_analyzer",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
},
"suggest_standard": {
"max_input_length": 50,
"analyzer": "standard",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
},
"suggest_first_py": {
"max_input_length": 50,
"analyzer": "first_py_letter_analyzer",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
},
"suggest": {
"max_input_length": 50,
"analyzer": "simple",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
},
"suggest_ik_word": {
"max_input_length": 50,
"analyzer": "ik_max_word",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
},
"suggest_pinyin": {
"max_input_length": 50,
"analyzer": "full_pinyin_letter_analyzer",
"preserve_position_increments": true,
"type": "completion",
"preserve_separators": true
}
}
}
}
}
搜索语句改成
{
"suggest": {
"text": "quchensh",
"keyword_pinyin": {
"completion": {
"field": "suggest_pinyin"
}
},
"s-pinyin": {
"completion": {
"field": "suggest_spinyin"
}
},
"standard-word": {
"completion": {
"field": "suggest_standard"
}
},
"keyword_first_py": {
"completion": {
"field": "suggest_first_py"
}
},
"ik-word": {
"completion": {
"field": "suggest_ik_word"
}
}
}
}
参考资料
主要参考
https://blog.csdn.net/baifanwudi/article/details/88662561 https://blog.csdn.net/wwd0501/article/details/80885987
https://www.jianshu.com/p/9e2c6a8e1b54
系统学习suggest
https://blog.csdn.net/supermao1013/article/details/84311057 https://www.cnblogs.com/wangzhuxing/p/9574630.html#_label2
自动纠错:用于英文可以,用于中文效果不佳
https://blog.csdn.net/Insightzen_xian/article/details/80692366
https://learnku.com/articles/37090
辅助
https://www.jianshu.com/p/8a6b80813a34
自定义分词器
https://www.cnblogs.com/shoufeng/p/10562746.html
ES Mapping、字段类型Field type详解
https://blog.csdn.net/ZYC88888/article/details/83059040
可以使用百度文本纠错接口
https://ai.baidu.com/ai-doc/NLP/Sk3pmn0o5