一、先摆需求:
1、中文搜索、英文搜索、中英混搜 如:“南京东路”,“cafe 南京东路店”
2、全拼搜索、首字母搜索、中文+全拼、中文+首字母混搜 如:“nanjingdonglu”,“njdl”,“南京donglu”,“南京dl”,“nang南东路”,“njd路”等等组合
3、简繁搜索、特殊符号过滤搜索 如:“龍馬”可通过“龙马”搜索,再比如 L.G.F可以通过lgf搜索,café可能通过cafe搜索
4、排序优先级为: 以关键字开头>包含关键字
二、生产效果图:
三、实现
1、索引设计
为搜索字段建立不同类型的索引,有全拼索引、首字母简写索引、Ngram索引以及IK索引,从各个角度分别击破,然后通过char-filter进行特殊符号与简繁转换。
1.建立settings
PUT /enterpriseextend
{
"settings": {
"analysis": {
"filter": {
"edge_ngram_filter": {
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 50
},
"pinyin_simple_filter": {
"type": "pinyin",
"keep_first_letter": true,
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
},
"pinyin_full_filter": {
"type": "pinyin",
"keep_first_letter": false,
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"none_chinese_pinyin_tokenize": true,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
}
},
"char_filter": {
"charconvert": {
"type": "mapping",
"mappings_path": "char_filter_text.txt"
}
},
"tokenizer": {
"ik_max_word": {
"type": "ik_max_word",
"use_smart": true
}
},
"analyzer": {
"ngramIndexAnalyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"edge_ngram_filter",
"lowercase"
],
"char_filter": [
"charconvert"
]
},
"ngramSearchAnalyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": [
"lowercase"
],
"char_filter": [
"charconvert"
]
},
"ikIndexAnalyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"char_filter": [
"charconvert"
]
},
"ikSearchAnalyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"char_filter": [
"charconvert"
]
},
"pinyiSimpleIndexAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_simple_filter",
"edge_ngram_filter",
"lowercase"
]
},
"pinyiSimpleSearchAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_simple_filter",
"lowercase"
]
},
"pinyiFullIndexAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_full_filter",
"lowercase"
]
},
"pinyiFullSearchAnalyzer": {
"tokenizer": "keyword",
"filter": [
"pinyin_full_filter",
"lowercase"
]
}
}
}
}
}
2.建立mapping
PUT enterpriseextend/_mapping/enterpriseextend
{
"properties": {
"id": {
"type": "long"
},
"entName": {
"type": "text",
"analyzer": "ikIndexAnalyzer",
"fields": {
"ngram": {
"type": "text",
"analyzer": "ngramIndexAnalyzer"
},
"SPY": {
"type": "text",
"analyzer": "pinyiSimpleIndexAnalyzer"
},
"FPY": {
"type": "text",
"analyzer": "pinyiFullIndexAnalyzer"
}
}
},
"serviceFinanceEntType": {
"type": "text",
"analyzer": "ik_max_word",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"serviceSupport": {
"type": "text",
"analyzer": "ik_max_word",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"serviceEntRat": {
"type": "text",
"analyzer": "ik_max_word",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
拼音插件的使用请参考:https://github.com/medcl/elasticsearch-analysis-pinyin
2、搜索构建
以下是dao层搜索实现代码(非完整代码,只摘录核心部分,主要是思路):
package com.boao.platform.search.dao;
import com.boao.platform.common.util.ChineseToPinYinUtil;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.*;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Repository;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
@Repository
public class EnterpriseExtendRecommendRepository {
@Autowired
private RestHighLevelClient client;
//索引库名称
private static final String INDEX = "enterpriseextend";
//文档类型
private static final String TYPE = "enterpriseextend";
public SearchResponse getList(String words,int pageNo,int pageSize){
// 这个sourcebuilder就类似于查询语句中最外层的部分。包括查询分页的起始,
// 查询语句的核心,查询结果的排序,查询结果截取部分返回等一系列配置
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
// 结果开始处
sourceBuilder.from((pageNo-1)*pageSize);
// 查询结果终止处
sourceBuilder.size(pageSize);
// 查询的等待时间
sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
//执行查询
sourceBuilder.query(chineseAndPinYinSearch(words));
System.out.println(sourceBuilder);
//指定索引库和类型
SearchRequest searchRequest = new SearchRequest(INDEX);
searchRequest.types(TYPE);
searchRequest.source(sourceBuilder);
try {
return client.search(searchRequest);
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
//中文、拼音混合搜索
private QueryBuilder chineseAndPinYinSearch(String words){
//使用dis_max直接取多个query中,分数最高的那一个query的分数即可
DisMaxQueryBuilder disMaxQueryBuilder=QueryBuilders.disMaxQuery();
/**
* 纯中文搜索,不做拼音转换,采用edge_ngram分词(优先级最高)
* 权重* 5
*/
QueryBuilder normSearchBuilder=QueryBuilders.matchQuery("entName.ngram",words).analyzer("ngramSearchAnalyzer").boost(5f);
/**
* 拼音简写搜索
* 1、分析key,转换为简写 case: 南京东路==>njdl,南京dl==>njdl,njdl==>njdl
* 2、搜索匹配,必须完整匹配简写词干
* 3、如果有中文前缀,则排序优先
* 权重*1
*/
String firstChar = ChineseToPinYinUtil.ToFirstChar(words);
TermQueryBuilder pingYinSampleQueryBuilder = QueryBuilders.termQuery("entName.SPY", firstChar);
/**
* 拼音简写包含匹配,如 njdl可以查出 "城市公牛 南京东路店",虽然非南京东路开头
* 权重*0.8
*/
QueryBuilder pingYinSampleContainQueryBuilder=null;
if(firstChar.length()>1){
pingYinSampleContainQueryBuilder=QueryBuilders.wildcardQuery("entName.SPY", "*"+firstChar+"*").boost(0.8f);
}
/**
* 拼音全拼搜索
* 1、分析key,获取拼音词干 case : 南京东路==>[nan,jing,dong,lu],南京donglu==>[nan,jing,dong,lu]
* 2、搜索查询,必须匹配所有拼音词,如南京东路,则nan,jing,dong,lu四个词干必须完全匹配
* 3、如果有中文前缀,则排序优先
* 权重*1
*/
QueryBuilder pingYinFullQueryBuilder=null;
if(words.length()>1){
pingYinFullQueryBuilder=QueryBuilders.matchPhraseQuery("entName.FPY", words).analyzer("pinyiFullSearchAnalyzer");
}
/**
* 完整包含关键字查询(优先级最低,只有以上四种方式查询无结果时才考虑)
* 权重*0.8
*/
QueryBuilder containSearchBuilder=QueryBuilders.matchQuery("entName", words).analyzer("ikSearchAnalyzer").minimumShouldMatch("100%");
disMaxQueryBuilder
.add(normSearchBuilder)
.add(pingYinSampleQueryBuilder)
.add(containSearchBuilder);
//以下两个对性能有一定的影响,故作此判定,单个字符不执行此类搜索
if(pingYinFullQueryBuilder!=null){
disMaxQueryBuilder.add(pingYinFullQueryBuilder);
}
if(pingYinSampleContainQueryBuilder!=null){
disMaxQueryBuilder.add(pingYinSampleContainQueryBuilder);
}
return disMaxQueryBuilder;
}
}
注:以上JAVA示例代码皆以 elasticsearch官方推荐的 Java High Level REST Client 6.2.2 为基础。
定制特殊符号及简繁转换文本:char_filter_text.txt,由于文件有点长,以下是部分内容,参考格式即可。
à=>a
á=>a
â=>a
ä=>a
À=>a
Â=>a
Ä=>a
è=>e
é=>e
ê=>e
ë=>e
È=>e
É=>e
Ê=>e
Ë=>e
î=>i
ï=>i
Î=>i
Ï=>i
ô=>o
ö=>o
Ô=>o
Ö=>o
ù=>u
û=>u
ü=>u
Ù=>u
Û=>u
Ü=>u
ç=>c
œ=>c
&=>
^=>
.=>
·=>
-=>
'=>
’=>
‘=>
/=>
醯壶=>醯壶
屢顧爾僕=>屡顾尔仆
見=>见
往裡=>往里
置言成範=>置言成范
捲動=>卷动
規=>规
齣電視=>出电视
覎=>觃
後堂=>后堂