下载与 Elasticsearch 版本匹配的 IK 分词器插件(版本必须与 Elasticsearch 版本一致),
地址: https://github.com/medcl/elasticsearch-analysis-ik
进入 es 容器, cd your-es-root/plugins/ && mkdir ik
[root@startsky dc]# docker exec -it elasticsearch /bin/bash
[root@startsky dc]# cd plugins
[root@startsky dc]# unzip elasticsearch.zip -d ik
编辑 IK 的配置文件 IKAnalyzer.cfg.xml:
<properties>
    <comment>IK Analyzer 扩展配置</comment>
    <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
    <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
    <entry key="remote_ext_dict">location</entry>
    <entry key="remote_ext_stopwords">http://xxx.com/xxx.dic</entry>
</properties>
在 IKAnalyzer.cfg.xml 所在的配置目录下创建 custom 目录,
并在其中创建 mydict.dic(扩展词典)
和 ext_stopword.dic(扩展停用词词典)
在 ext_stopword.dic
中(停用词,每行一个):
又
及
对
就
并
很
或
把
是
的
着
给
而
被
让
在
还
比
等
当
与
在 mydict.dic
中(自定义词,每行一个):
es你好
重启elasticsearch
POST _analyze
{
"tokenizer": "standard",
"text": "hello workd es你好好了"
}
//结果
{
"tokens" : [
{
"token" : "hello",
"start_offset" : 0,
"end_offset" : 5,
"type" : "<ALPHANUM>",
"position" : 0
},
{
"token" : "workd",
"start_offset" : 6,
"end_offset" : 11,
"type" : "<ALPHANUM>",
"position" : 1
},
{
"token" : "es",
"start_offset" : 12,
"end_offset" : 14,
"type" : "<ALPHANUM>",
"position" : 2
},
{
"token" : "你",
"start_offset" : 14,
"end_offset" : 15,
"type" : "<IDEOGRAPHIC>",
"position" : 3
},
{
"token" : "好",
"start_offset" : 15,
"end_offset" : 16,
"type" : "<IDEOGRAPHIC>",
"position" : 4
},
{
"token" : "好",
"start_offset" : 16,
"end_offset" : 17,
"type" : "<IDEOGRAPHIC>",
"position" : 5
},
{
"token" : "了",
"start_offset" : 17,
"end_offset" : 18,
"type" : "<IDEOGRAPHIC>",
"position" : 6
}
]
}
可以发现,使用标准的分词器,es你好 这个词被拆分了
使用 ik
分词器:
POST _analyze
{
"tokenizer": "ik_max_word",
"text": "hello workd es你好好了"
}
//结果
{
"tokens" : [
{
"token" : "hello",
"start_offset" : 0,
"end_offset" : 5,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "workd",
"start_offset" : 6,
"end_offset" : 11,
"type" : "ENGLISH",
"position" : 1
},
{
"token" : "es你好",
"start_offset" : 12,
"end_offset" : 16,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "es",
"start_offset" : 12,
"end_offset" : 14,
"type" : "ENGLISH",
"position" : 3
},
{
"token" : "你好",
"start_offset" : 14,
"end_offset" : 16,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "好好",
"start_offset" : 15,
"end_offset" : 17,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "好了",
"start_offset" : 16,
"end_offset" : 18,
"type" : "CN_WORD",
"position" : 6
}
]
}
可以发现,使用 ik_max_word 分词器时,自定义词 es你好
被识别为一个完整的词语(同时 ik_max_word 还会输出 es、你好 等更细粒度的词)