Business requirement: the project needs search that supports pinyin search, Chinese search, mixed Chinese/pinyin search, and abbreviated (first-letter) pinyin search, with the matched parts of the result fields highlighted.
While working on an earlier requirement, I hit a problem with mixed pinyin/Chinese search: when searching by pinyin and asking for highlighting on the returned results, the highlight tags were not applied. Looking through Baidu, CSDN and the Elasticsearch Chinese community, I found plenty of people asking about the same issue and essentially no good solution, so I wrote this post. I hope it helps.
Pinyin search, Chinese search, mixed Chinese/pinyin search, and abbreviated (first-letter) pinyin search.
For installing and using the Chinese/pinyin analyzer plugins, see: https://blog.csdn.net/weixin_38822045/article/details/88970264
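Before creating any index, it is worth confirming that both the ik and pinyin analysis plugins are actually installed on every node (their versions must match the Elasticsearch version). A quick check, assuming the same node as in the examples below:
GET http://192.168.1.200:9200/_cat/plugins?v
# the output should list analysis-ik and analysis-pinyin for each node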
________________________________________ Hands-on part starts here ________________________________________
Step 1: the incorrect example (the problem case)!
The incorrect index settings:
PUT http://192.168.1.200:9200/liuwunan
{
"settings": {
"index": {
"max_result_window": 10000000
},
"refresh_interval": "5s",
"number_of_shards": 1,
"number_of_replicas": 1,
"analysis": { #分析
"filter": {
"edge_ngram_filter": { #自动补全
"type": "edge_ngram",
"min_gram": 1,
"max_gram": 50
},
"pinyin_full_filter": { #拼音全拼
"type": "pinyin",
"keep_first_letter": false,
"keep_separate_first_letter": false,
"keep_full_pinyin": true,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
},
"pinyin_simple_filter": { #拼音简拼
"type": "pinyin",
"keep_first_letter": true,
"keep_separate_first_letter": false,
"keep_full_pinyin": false,
"keep_original": false,
"limit_first_letter_length": 50,
"lowercase": true
}
},
"analyzer": { #分析器
"pinyiSimpleIndexAnalyzer": {
"type": "custom",
"tokenizer": "keyword",
"filter": ["pinyin_simple_filter","edge_ngram_filter","lowercase"]
},
"pinyiFullIndexAnalyzer": {
"type": "custom","tokenizer": "keyword",
"filter": ["pinyin_full_filter", "lowercase"]
}
}
}
}
}
The incorrect Elasticsearch mapping:
PUT http://192.168.1.200:9200/liuwunan/_mapping/doc
{
"dynamic_templates": [{
"text": {
"match_mapping_type": "text",
"mapping": {
"analyzer": "ik_max_word",
"fields": {
"raw": {
"ignore_above": 256,
"type": "keyword"
}
},
"search_analyzer": "ik_max_word",
"type": "text"
}
}
},
{
"integer": {
"match_mapping_type": "long",
"mapping": {
"fields": {
"raw": {
"type": "integer"
}
},
"type": "integer"
}
}
}
],
"properties": {
"title": {
"type": "keyword",
"fields": {
"fpy": { #针对Title 全拼
"type": "text",
"index": true,
"analyzer": "pinyiFullIndexAnalyzer"
},
"spy": { #针对Title 简拼
"type": "text",
"index": true,
"analyzer": "pinyiSimpleIndexAnalyzer"
}
}
}
}
}
Step 2: Add test data
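The original post does not show the indexing requests themselves, only the search result below; here is a minimal sketch of how the three test documents could be added (document IDs and the doc type are taken from that result):
PUT http://192.168.1.200:9200/liuwunan/doc/1
{ "title": "小明你好" }
PUT http://192.168.1.200:9200/liuwunan/doc/2
{ "title": "明天你好" }
PUT http://192.168.1.200:9200/liuwunan/doc/3
{ "title": "中华人" }
A search across the index then confirms that all three documents are present: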
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 1,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"title": "小明你好"
}
},
{
"_index": "liuwunan",
"_type": "doc",
"_id": "2",
"_score": 1,
"_source": {
"title": "明天你好"
}
},
{
"_index": "liuwunan",
"_type": "doc",
"_id": "3",
"_score": 1,
"_source": {
"title": "中华人"
}
}
]
}
}
Step 3: Test pinyin search, Chinese search, and mixed Chinese/pinyin search
① Full-pinyin search
POST http://192.168.1.200:9200/liuwunan/doc/_search
{
"query": {
"match": {
"title.fpy": "mingtian" #fpy 意为全拼缩写
}
}
}
The response:
{
"took": 1,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.398811,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "2",
"_score": 1.398811, #分数最高
"_source": {
"title": "明天你好"
}
},
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 0.45315093,
"_source": {
"title": "小明你好"
}
}
]
}
}
② Chinese search
POST http://192.168.1.200:9200/liuwunan/doc/_search
{
"query": {
"match": {
"title.fpy": "明天"
}
}
}
The response:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.398811,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "2",
"_score": 1.398811, #分数第一
"_source": {
"title": "明天你好"
}
},
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 0.45315093,
"_source": {
"title": "小明你好"
}
}
]
}
}
③ Mixed Chinese/pinyin search
POST http://192.168.1.200:9200/liuwunan/doc/_search
{
"query": {
"match": {
"title.fpy": "明tian"
}
}
}
The response:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1.398811,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "2",
"_score": 1.398811, #分数第一
"_source": {
"title": "明天你好"
}
},
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 0.45315093,
"_source": {
"title": "小明你好"
}
}
]
}
}
④ Abbreviated (first-letter) pinyin search
POST http://192.168.1.200:9200/liuwunan/doc/_search
{
"query": {
"match": {
"title.spy": "mt"
}
}
}
The response:
{
"took": 6,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 3.0916727,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "2",
"_score": 3.0916727, #只有一条
"_source": {
"title": "明天你好"
}
}
]
}
}
From the results above, it looked like every requirement was satisfied... and then a colleague told me that for pinyin queries the highlight tags in the returned results were placed incorrectly. At first I assumed the problem was in how the colleague used the API, so I wrote the JSON by hand and sent the POST request myself.
POST http://192.168.1.200:9200/liuwunan/doc/_search
{
"query": {
"match": {
"title.spy": "小明"
}
},
"highlight" : {
"fields": {
"title.spy": {
}
}
}
}
The response:
{
"took": 112,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.6954334,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 1.6954334,
"_source": {
"title": "小明你好"
},
"highlight": {
"title.spy": [
"小明你好" #高亮标签
]
}
}
]
}
}
As you can see, the highlight tags wrap the entire title value, which is wrong.
The desired result is:
{
"took": 112,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.6954334,
"hits": [
{
"_index": "liuwunan",
"_type": "doc",
"_id": "1",
"_score": 1.6954334,
"_source": {
"title": "小明你好"
},
"highlight": {
"title.spy": [
"小明你好" #高亮标签
]
}
}
]
}
}
Searching Baidu and CSDN shows many people hitting this problem; the Elasticsearch Chinese community thread https://elasticsearch.cn/article/6166 describes the same issue with a similar workaround. I also found a CSDN post for Elasticsearch 2.x; following its settings and mapping, the problem really does not occur on 2.x.
But on 6.x the same approach does not work at all: I repeated the 2.x steps and did not get a single highlight tag. So the problem genuinely exists on the newer versions. [Filling in the pit]
After a lot of testing (changing the mapping, adjusting the settings, and running text through the Elasticsearch _analyze API), I found the following.
POST http://192.168.1.200:9200/liuwunan/_analyze
{
"field": "title.fpy",
"text": ["环球"]
}
The response:
{
"tokens": [
{
"token": "huan",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 0
},
{
"token": "qiu",
"start_offset": 0,
"end_offset": 2,
"type": "word",
"position": 1
}
]
}
The tokens come back with "type": "word", which means the field is being tokenized like plain English text rather than Chinese.
The correct example:
POST http://192.168.1.245:9200/b_and_r/_analyze
{
"field": "title.fpy",
"text": ["环球"]
}
The response:
{
"tokens": [
{
"token": "huan",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "huanqiu",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "qiu",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 1
}
]
}
The correct result has "type": "CN_WORD", i.e. tokens produced by the Chinese tokenizer. So the settings and the mapping are at fault.
Analysis: with the settings and mapping shown above, the pinyin fields are built on an English-style (keyword) tokenizer, and the text is split straight into single pinyin tokens. A Chinese query is likewise converted to pinyin and matched against those pinyin tokens, so what matches, and what the highlighter tries to tag, are pinyin terms; but the data returned in the result is the original Chinese, not pinyin, so the highlight tags cannot be placed on it.
The solution:
Use a mixed filter configuration: the Chinese text is analyzed into both Chinese tokens and pinyin tokens. A pinyin query then matches the pinyin tokens, a Chinese query matches the Chinese tokens, and in either case the highlight tags land on the original Chinese text.
The correct Elasticsearch settings:
PUT http://192.168.1.200:9200/test
{
"settings": {
"index": {
"max_result_window": 10000000
},
"refresh_interval": "5s",
"number_of_shards": 1,
"number_of_replicas": 1,
"analysis": {
"filter": {
"pinyin_full_filter": {
"keep_joined_full_pinyin": "true",
"lowercase": "true",
"keep_original": "false",
"keep_first_letter": "false",
"keep_separate_first_letter": "false",
"type": "pinyin",
"keep_none_chinese": "false",
"limit_first_letter_length": "50",
"keep_full_pinyin": "true"
},
"pinyin_simple_filter": {
"keep_joined_full_pinyin": "true",
"lowercase": "true",
"none_chinese_pinyin_tokenize": "false",
"padding_char": " ",
"keep_original": "true",
"keep_first_letter": "true",
"keep_separate_first_letter": "false",
"type": "pinyin",
"keep_full_pinyin": "false"
}
},
"analyzer": {
"pinyinFullIndexAnalyzer": {
"filter": ["asciifolding", "lowercase", "pinyin_full_filter"],
"type": "custom",
"tokenizer": "ik_max_word"
},
"ik_pinyin_analyzer": {
"filter": ["asciifolding", "lowercase", "pinyin_full_filter", "word_delimiter"],
"type": "custom",
"tokenizer": "ik_smart"
},
"ikIndexAnalyzer": {
"filter": ["asciifolding", "lowercase"],
"type": "custom",
"tokenizer": "ik_max_word"
},
"pinyiSimpleIndexAnalyzer": {
"type": "custom",
"tokenizer": "ik_max_word",
"filter": ["pinyin_simple_filter", "lowercase"]
}
}
}
}
}
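Once the index exists, the analyzers can be verified directly against it, before any mapping or data is added. A quick sanity check (same idea as the _analyze test above; the expected tokens are huan, huanqiu and qiu, all with "type": "CN_WORD"):
POST http://192.168.1.200:9200/test/_analyze
{
"analyzer": "pinyinFullIndexAnalyzer",
"text": ["环球"]
}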
The correct Elasticsearch mapping:
PUT http://192.168.1.200:9200/test/_mapping/doc
{
"dynamic_templates": [{
"strings": {
"match_mapping_type": "string",
"mapping": {
"analyzer": "ik_max_word",
"fields": {
"raw": {
"ignore_above": 256,
"type": "keyword"
}
},
"search_analyzer": "ik_max_word",
"type": "text"
}
}
},
{
"integer": {
"match_mapping_type": "long",
"mapping": {
"fields": {
"raw": {
"type": "integer"
}
},
"type": "integer"
}
}
}
],
"properties": {
"title": {
"type": "text",
"fields": {
"fpy": {
"type": "text",
"index": true,
"analyzer": "pinyinFullIndexAnalyzer"
},
"spy": {
"type": "text",
"index": true,
"analyzer": "pinyiSimpleIndexAnalyzer"
}
},
"analyzer": "ikIndexAnalyzer"
}
}
}
The key point is "analyzer": "ik_max_word": the pinyin filters now sit on top of the ik tokenizer instead of the keyword tokenizer.
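Note that analyzer changes only affect documents indexed after the change, so existing data has to be reindexed into the newly created index. A minimal sketch using the _reindex API, assuming the old data lives in liuwunan and the new index is test:
POST http://192.168.1.200:9200/_reindex
{
"source": { "index": "liuwunan" },
"dest": { "index": "test" }
}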
Finally, run the queries again (the results below come from an index named b_and_r, on another node, that uses the same settings and mapping).
① Full-pinyin search
POST http://192.168.1.245:9200/b_and_r/doc/_search
{
"_source": [ "title"],
"query": {
"match": {
"title.spy": "huanqiu"
}
},
"highlight" : {
"fields": {
"title.spy": {
}
}
}
}
The response:
"hits": [
{
"_index": "b_and_r",
"_type": "doc",
"_id": "hubmH2oBGUgj470FD0s1",
"_score": 5.3819566,
"_source": {
"title": "cast公司与NBC环球公司啊"
},
"highlight": {
"title.spy": [
"cast公司与NBC环球公司啊"
]
}
}
}
]
This meets the requirement: a pinyin query returns the Chinese document and the highlight tags are placed on the Chinese text. Now let's test abbreviated-pinyin search, Chinese search, and mixed search (a sketch of the mixed case follows after ③).
② Abbreviated (first-letter) pinyin search
POST http://192.168.1.245:9200/b_and_r/doc/_search
{
"_source": [ "title"],
"query": {
"match": {
"title.spy": "hq"
}
},
"highlight" : {
"fields": {
"title.spy": {
}
}
}
}
The response:
"hits": [
{
"_index": "b_and_r",
"_type": "doc",
"_id": "hubmH2oBGUgj470FD0s1",
"_score": 4.294393,
"_source": {
"title": "cast公司与NBC环球公司啊"
},
"highlight": {
"title.spy": [
"cast公司与NBC环球公司啊"
]
}
}
]
③ Chinese search
POST http://192.168.1.245:9200/b_and_r/doc/_search
{
"_source": [ "title"],
"query": {
"match": {
"title.fpy": "环球"
}
},
"highlight" : {
"fields": {
"title.fpy": {
}
}
}
}
=======================================================================
{
"_source": [ "title"],
"query": {
"match": {
"title": "环球"
}
},
"highlight" : {
"fields": {
"title": {
}
}
}
}
Either form works.
==========================================================================
The response:
"hits": [
{
"_index": "b_and_r",
"_type": "doc",
"_id": "hubmH2oBGUgj470FD0s1",
"_score": 3.652075,
"_source": {
"title": "cast公司与NBC环球公司啊"
},
"highlight": {
"title": [
"cast公司与NBC环球公司啊"
]
}
}
]
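The mixed Chinese + pinyin case can be exercised the same way. The request below is only a sketch with a hypothetical query string "环qiu" (this exact test is not part of the original run, and how the latin part is handled depends on the keep_none_chinese settings of the pinyin filter, so verify the highlight output on your own data):
POST http://192.168.1.245:9200/b_and_r/doc/_search
{
"_source": [ "title"],
"query": {
"match": {
"title.fpy": "环qiu"
}
},
"highlight" : {
"fields": {
"title.fpy": {
}
}
}
}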
As shown above, all the requirements are satisfied!
With that, pinyin search and highlighting on Elasticsearch 6.0.1 are fully working. If you need the same behaviour, you can simply copy the configuration above.
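By default the highlighter wraps matches in <em>...</em>. If the front end needs its own markup, pre_tags and post_tags can be set per request; a small sketch:
POST http://192.168.1.245:9200/b_and_r/doc/_search
{
"query": {
"match": {
"title.fpy": "huanqiu"
}
},
"highlight" : {
"pre_tags": ["<span class=\"hl\">"],
"post_tags": ["</span>"],
"fields": {
"title.fpy": {
}
}
}
}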
For the detailed configuration, see: https://blog.csdn.net/weixin_38822045/article/details/89138168