ElasticSearch Suggest 提示(生产使用)

  1. 以下搜索提示(suggest)方案配合使用了 ik 分词器和 pinyin 插件,需先安装(版本须与 ElasticSearch 版本一致):

    https://github.com/medcl/elasticsearch-analysis-ik/releases
    https://github.com/medcl/elasticsearch-analysis-pinyin/releases
    
  2. 检验ik分词器和拼音插件是否生效

    POST /_analyze
    {
    		"analyzer":"pinyin",
    		"text":"北京东"
    }
    
    POST /_analyze
    {
    		"analyzer":"ik_max_word",
    		"text":"北京东"
    }
    
    拼音的分析结果
    {
      "tokens": [
        {
          "token": "bei",
          "start_offset": 0,
          "end_offset": 0,
          "type": "word",
          "position": 0
        },
        {
          "token": "jing",
          "start_offset": 0,
          "end_offset": 0,
          "type": "word",
          "position": 1
        },
        {
          "token": "dong",
          "start_offset": 0,
          "end_offset": 0,
          "type": "word",
          "position": 2
        },
        {
          "token": "bjd",
          "start_offset": 0,
          "end_offset": 0,
          "type": "word",
          "position": 2
        }
      ]
    }
    --------
    IK分词分析结果
    {
      "tokens": [
        {
          "token": "北京",
          "start_offset": 0,
          "end_offset": 2,
          "type": "CN_WORD",
          "position": 0
        },
        {
          "token": "京东",
          "start_offset": 1,
          "end_offset": 3,
          "type": "CN_WORD",
          "position": 1
        }
      ]
    }
    
  3. 建立索引

    
    {
    	"index": {
    		"analysis": {
    			"analyzer": {
    				"pinyin_analyzer": {
    					"tokenizer": "s-pinyin"
    				},
    				"first_py_letter_analyzer": {
    					"tokenizer": "first_py_letter"
    				},
    				"full_pinyin_letter_analyzer": {
    					"tokenizer": "full_pinyin_letter"
    				}
    			},
    			"tokenizer": {
    				"s-pinyin": {
    					"keep_joined_full_pinyin": true,
    					"keep_first_letter": true,
    					"keep_separate_first_letter": false,
    					"lowercase": true,
    					"type": "pinyin",
    					"limit_first_letter_length": 16,
    					"keep_original": true,
    					"keep_full_pinyin": true,
    					"keep_none_chinese_in_joined_full_pinyin": true
    				},
    				"first_py_letter": {
    					"type": "pinyin",
    					"keep_first_letter": true,
    					"keep_full_pinyin": false,
    					"keep_original": false,
    					"limit_first_letter_length": 16,
    					"lowercase": true,
    					"trim_whitespace": true,
    					"keep_none_chinese_in_first_letter": false,
    					"none_chinese_pinyin_tokenize": false,
    					"keep_none_chinese": true,
    					"keep_none_chinese_in_joined_full_pinyin": true
    				},
    				"full_pinyin_letter": {
    					"type": "pinyin",
    					"keep_separate_first_letter": false,
    					"keep_full_pinyin": false,
    					"keep_original": false,
    					"limit_first_letter_length": 16,
    					"lowercase": true,
    					"keep_first_letter": false,
    					"keep_none_chinese_in_first_letter": false,
    					"none_chinese_pinyin_tokenize": false,
    					"keep_none_chinese": true,
    					"keep_joined_full_pinyin": true,
    					"keep_none_chinese_in_joined_full_pinyin": true
    				}
    			}
    		}
    	}
    }
    
  4. 建立mapping

    {
    	"suggest-word": {
    		"properties": {
    			"suggest": {
    				"type": "completion",
    				"fields": {
    					"s-pinyin": {
    						"type": "completion",
    						"analyzer": "pinyin_analyzer"
    					},
    					"keyword-pinyin": {
    						"type": "completion",
    						"analyzer": "full_pinyin_letter_analyzer"
    					},
    					"keyword-first-py": {
    						"type": "completion",
    						"analyzer": "first_py_letter_analyzer"
    					},
    					"ik-word":{
                              "type": "completion",
    						"analyzer": "ik_max_word"
    					},
    					"standard-word":{
                              "type": "completion",
    						"analyzer": "standard"
    					}
    				}
    			}
    		}
    	}
    }
    
  5. 查询suggest

    {
      "suggest": {
        "text": "美白",
        "keyword_pinyin": {
          "completion": {
            "field": "suggest.keyword-pinyin"
          }
        },
        "s-pinyin": {
          "completion": {
            "field": "suggest.s-pinyin"
          }
        },
        "standard-word": {
          "completion": {
            "field": "suggest.standard-word"
          }
        },
        "keyword_first_py": {
          "completion": {
            "field": "suggest.keyword-first-py"
          }
        },
        "ik-word": {
          "completion": {
            "field": "suggest.ik-word"
          }
        }
      }
    }
    

    上面的查询会返回 5 组 suggest 结果。实际使用时不必全部采用,应根据输入内容按下面的优先级依次选取:

    ik-word->s-pinyin->keyword_pinyin->keyword_first_py->standard-word

    与用户输入偏差越大的 suggester 越应排在后面,只在前面的结果不足时用于补全。

  6. 关于词库

    词库应该是库内的专业词库,或者从搜索日志里捞出搜索量很大的词汇,充当搜索词建议。

  7. Java API(示例查询的是第 8 节中建立的独立字段 suggest_spinyin、suggest_standard 等)

    /**
     * Returns completion suggestions for {@code text}, in the order in which the individual
     * suggesters are consulted.
     *
     * <p>Six completion suggesters are sent in one search request against the
     * {@code cb_es_ext_word} index. Their results are merged as follows:
     * <ol>
     *   <li>{@code ik-word} is always taken;</li>
     *   <li>{@code s-pinyin} and {@code keyword_pinyin} are added when fewer than 10 terms
     *       have been collected;</li>
     *   <li>{@code standard-word}, {@code keyword-first-py} and {@code suggest-fuzzy} are each
     *       consulted, in that order, only while nothing has been collected.</li>
     * </ol>
     *
     * @param text the user input to complete
     * @return the distinct suggested terms in merge order; an empty list when {@code text} is
     *         null or empty, when the search request fails, or when nothing matches
     */
    public List<String> suggestWord(String text) {
    		// A completion suggester needs a prefix; without one there is nothing to ask for.
    		if (text == null || text.isEmpty()) {
    			return new ArrayList<>();
    		}
    
    		String indexName = "cb_es_ext_word";
    
    		CompletionSuggestionBuilder sPinyin = SuggestBuilders.completionSuggestion("suggest_spinyin").prefix(text);
    		CompletionSuggestionBuilder standardWord = SuggestBuilders.completionSuggestion("suggest_standard").prefix(text);
    		CompletionSuggestionBuilder keywordPinyin = SuggestBuilders.completionSuggestion("suggest_pinyin").prefix(text);
    		CompletionSuggestionBuilder ikWord = SuggestBuilders.completionSuggestion("suggest_ik_word").prefix(text);
    		CompletionSuggestionBuilder keywordFirstPy = SuggestBuilders.completionSuggestion("suggest_first_py").prefix(text);
    		CompletionSuggestionBuilder suggestFuzzy = SuggestBuilders.completionSuggestion("suggest").prefix(text,Fuzziness.TWO);
    		SearchRequest searchRequest = new SearchRequest().indices(indexName).types(ElasticSearchConstant.DEFAULT_TYPE_STR).source(new SearchSourceBuilder().suggest(
    				new SuggestBuilder().addSuggestion("s-pinyin", sPinyin)
    						.addSuggestion("standard-word", standardWord)
    						.addSuggestion("keyword_pinyin", keywordPinyin)
    						.addSuggestion("ik-word", ikWord)
    						.addSuggestion("keyword-first-py", keywordFirstPy)
    						.addSuggestion("suggest-fuzzy", suggestFuzzy)
    		));
    
    		SearchResponse searchResponse;
    		try {
    			LOGGER.debug(" SearchRequest String:" + searchRequest.source().toString());
    			searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);
    		} catch (IOException e) {
    			// Previously the exception was printed and execution continued with a null
    			// response, which then failed with a NullPointerException below.
    			LOGGER.error("Suggest request failed for index " + indexName, e);
    			return new ArrayList<>();
    		}
    
    		Suggest suggestions = searchResponse.getSuggest();
    		if (suggestions == null) {
    			return new ArrayList<>();
    		}
    
    		// Merge the terms; LinkedHashMap keeps them in the order they were first seen.
    		Map<String, Integer> suggestMap = new LinkedHashMap<>();
    
    		handlerSuggest(suggestions,suggestMap,"ik-word");
    		if (suggestMap.size() < 10) {
    			handlerSuggest(suggestions,suggestMap,"s-pinyin");
    			handlerSuggest(suggestions,suggestMap,"keyword_pinyin");
    		}
    		if (suggestMap.isEmpty()) {
    			handlerSuggest(suggestions,suggestMap,"standard-word");
    		}
    		if (suggestMap.isEmpty()) {
    			handlerSuggest(suggestions,suggestMap,"keyword-first-py");
    		}
    		if (suggestMap.isEmpty()) {
    			// Last resort: fuzzy match against the domain dictionary to correct the input.
    			handlerSuggest(suggestions,suggestMap,"suggest-fuzzy");
    		}
    
    		LOGGER.debug(JSON.toJSONString(suggestMap));
    
    		/*
    		 * Intended usage of the suggesters:
    		 * 1. Input that is entirely Chinese: use ik-word (ik tokenizer) and standard.
    		 * 2. Input mixing English and Chinese: use s-pinyin and keyword_pinyin.
    		 * 3. Input that is entirely English: use the pinyin suggesters.
    		 */
    
    		return new ArrayList<>(suggestMap.keySet());
    	}
    	
    	/**
    	 * Adds every option text of the named suggestion to {@code suggestMap}, counting how many
    	 * times each text has been seen.
    	 *
    	 * <p>A text that is not yet in the map is inserted with count 1; a text that is already
    	 * present has its count incremented. Does nothing when {@code suggestions} is null or
    	 * contains no suggestion called {@code suggestName}.
    	 *
    	 * @param suggestions the suggest section of a search response
    	 * @param suggestMap  accumulator mapping suggested text to its occurrence count; mutated in place
    	 * @param suggestName the name the suggestion was registered under in the request
    	 */
    	private void handlerSuggest(Suggest suggestions,Map<String, Integer> suggestMap,String suggestName){
    		if (suggestions == null) {
    			return;
    		}
    		Suggest.Suggestion<? extends Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option>> suggestion =
    				suggestions.getSuggestion(suggestName);
    		// getSuggestion returns null when the response has no suggestion with this name.
    		if (suggestion == null) {
    			return;
    		}
    		for (Suggest.Suggestion.Entry<? extends Suggest.Suggestion.Entry.Option> entry : suggestion.getEntries()) {
    			for (Suggest.Suggestion.Entry.Option option : entry.getOptions()) {
    				suggestMap.merge(option.getText().toString(), 1, Integer::sum);
    			}
    		}
    	}
    
  8. 使用weight

    数据结构

    {
        "suggest":{
            "input":"联想词",
            "weight":10
        }
    }
    

    使用 weight 时,不能再通过 fields(multi-fields)子字段的方式映射,

    需要为每种分词器分别建立独立的 completion 字段:

    {
    	"mappings": {
    		"content_bank_entity": {
    			"properties": {
    				"suggest_spinyin": {
    					"max_input_length": 50,
    					"analyzer": "pinyin_analyzer",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				},
    				"suggest_standard": {
    					"max_input_length": 50,
    					"analyzer": "standard",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				},
    				"suggest_first_py": {
    					"max_input_length": 50,
    					"analyzer": "first_py_letter_analyzer",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				},
    				"suggest": {
    					"max_input_length": 50,
    					"analyzer": "simple",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				},
    				"suggest_ik_word": {
    					"max_input_length": 50,
    					"analyzer": "ik_max_word",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				},
    				"suggest_pinyin": {
    					"max_input_length": 50,
    					"analyzer": "full_pinyin_letter_analyzer",
    					"preserve_position_increments": true,
    					"type": "completion",
    					"preserve_separators": true
    				}
    			}
    		}
    	}
    }
    

    相应地,搜索语句中的 field 改为指向上面的独立字段:

    {
      "suggest": {
        "text": "quchensh",
        "keyword_pinyin": {
          "completion": {
            "field": "suggest_pinyin"
          }
        },
        "s-pinyin": {
          "completion": {
            "field": "suggest_spinyin"
          }
        },
        "standard-word": {
          "completion": {
            "field": "suggest_standard"
          }
        },
        "keyword_first_py": {
          "completion": {
            "field": "suggest_first_py"
          }
        },
        "ik-word": {
          "completion": {
            "field": "suggest_ik_word"
          }
        }
      }
    }
    
  9. 参考资料
    主要参考

    https://blog.csdn.net/baifanwudi/article/details/88662561 https://blog.csdn.net/wwd0501/article/details/80885987

    https://www.jianshu.com/p/9e2c6a8e1b54

    系统学习suggest

    https://blog.csdn.net/supermao1013/article/details/84311057 https://www.cnblogs.com/wangzhuxing/p/9574630.html#_label2

    自动纠错:适用于英文,不适用于中文

    https://blog.csdn.net/Insightzen_xian/article/details/80692366

    https://learnku.com/articles/37090

    辅助

    https://www.jianshu.com/p/8a6b80813a34

    自定义分词器

    https://www.cnblogs.com/shoufeng/p/10562746.html

    ES Mapping、字段类型Field type详解

    https://blog.csdn.net/ZYC88888/article/details/83059040

    可以使用百度文本纠错接口

    https://ai.baidu.com/ai-doc/NLP/Sk3pmn0o5

你可能感兴趣的:(搜索)