一、IK分词插件
ElasticSearch 默认的分词器对中文按单个字分词,效果很差。
Elasticsearch-analysis-ik,这是一个将 Lucene IK 分词器集成到 Elasticsearch 的 IK 分词器插件,并且支持自定义词典。
地址:https://github.com/medcl/elasticsearch-analysis-ik
releases 地址:
https://github.com/medcl/elasticsearch-analysis-ik/releases?after=v6.6.2
- 安装
elasticsearch7.3版本已经不需要额外安装中文分词插件。
# 安装插件
[root@localhost elasticsearch-6.4.3]# ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.4.3/elasticsearch-analysis-ik-6.4.3.zip
-> Downloading https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.4.3/elasticsearch-analysis-ik-6.4.3.zip
[=================================================] 100%
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ WARNING: plugin requires additional permissions @
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
* java.net.SocketPermission * connect,resolve
See http://docs.oracle.com/javase/8/docs/technotes/guides/security/permissions.html
for descriptions of what these permissions allow and the associated risks.
Continue with installation? [y/N]y
-> Installed analysis-ik
# 安装完成
[root@localhost ~]# ls /usr/local/elasticsearch-6.4.3/plugins/
analysis-ik
# 修改配置
[es@localhost elasticsearch-6.4.3]$ vi config/elasticsearch.yml
# 添加内容
# 重启elasticsearch
注意:中文分词插件版本与 ES 版本必须一致。
- java.net.SocketPermission错误解决
# 在config目录中创建文件socketPolicy.policy
[root@localhost config]# vi socketPolicy.policy
# 内容
grant {
permission java.net.SocketPermission "*:*","connect,resolve";
};
# 在config目录 jvm.options 文件最后添加
[root@localhost elasticsearch-6.4.3]# vi config/jvm.options
-Djava.security.policy=/usr/local/elasticsearch-6.4.3/config/socketPolicy.policy
- 测试IK分词
两种analyzer,一般是选用ik_max_word
ik_max_word
: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合;
ik_smart
: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”。
elasticsearch7.3版本已经不需要额外安装中文分词插件。
GET /test_index/_analyze
{
"text": "安徽高校新设殡葬专业上热搜 校方称学生刚入学就被 预定",
"analyzer": "ik_max_word"
}
结果:
{
"tokens": [
{
"token": "安徽",
"start_offset": 0,
"end_offset": 2,
"type": "CN_WORD",
"position": 0
},
{
"token": "高校",
"start_offset": 2,
"end_offset": 4,
"type": "CN_WORD",
"position": 1
},
{
"token": "新设",
"start_offset": 4,
"end_offset": 6,
"type": "CN_WORD",
"position": 2
},
{
"token": "殡葬",
"start_offset": 6,
"end_offset": 8,
"type": "CN_WORD",
"position": 3
},
{
"token": "专业",
"start_offset": 8,
"end_offset": 10,
"type": "CN_WORD",
"position": 4
},
{
"token": "上",
"start_offset": 10,
"end_offset": 11,
"type": "CN_CHAR",
"position": 5
},
{
"token": "热",
"start_offset": 11,
"end_offset": 12,
"type": "CN_CHAR",
"position": 6
},
{
"token": "搜",
"start_offset": 12,
"end_offset": 13,
"type": "CN_CHAR",
"position": 7
},
{
"token": "校方",
"start_offset": 14,
"end_offset": 16,
"type": "CN_WORD",
"position": 8
},
{
"token": "称",
"start_offset": 16,
"end_offset": 17,
"type": "CN_CHAR",
"position": 9
},
{
"token": "学生",
"start_offset": 17,
"end_offset": 19,
"type": "CN_WORD",
"position": 10
},
{
"token": "刚",
"start_offset": 19,
"end_offset": 20,
"type": "CN_CHAR",
"position": 11
},
{
"token": "入学",
"start_offset": 20,
"end_offset": 22,
"type": "CN_WORD",
"position": 12
},
{
"token": "就被",
"start_offset": 22,
"end_offset": 24,
"type": "CN_WORD",
"position": 13
},
{
"token": "预定",
"start_offset": 25,
"end_offset": 27,
"type": "CN_WORD",
"position": 14
}
]
}
- ES中文分词示例
英文分词:"analyzer": "english"
中文分词:"analyzer": "ik_max_word"
创建一个test_index索引,类型为:test_type,包含两个字段id与content,其中对content进行分词。
PUT /test_index
{
"mappings": {
"test_type": {
"properties": {
"id": {
"type": "long"
},
"content": {
"type": "text",
"analyzer": "ik_max_word",
"search_analyzer": "ik_smart"
}
}
}
}
}
添加数据:
POST /test_index/test_type/_bulk
{"index":{}}
{"id":1,"content":"美国留给伊拉克的是个烂摊子吗"}
{"index":{}}
{"id":2,"content":"公安部:各地校车将享最高路权"}
{"index":{}}
{"id":3,"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
{"index":{}}
{"id":4,"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
查询数据:
GET /test_index/_search
{
"query" : { "match" : { "content" : "韩警" }}
}
结果:
{
"took": 8,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1.6462245,
"hits": [
{
"_index": "test_index",
"_type": "test_type",
"_id": "hwVVNG4B1-XKoZdra0Jm",
"_score": 1.6462245,
"_source": {
"id": 3,
"content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
}
}
]
}
}
高亮查询
GET /test_index/_search
{
"query" : { "match" : { "content" : "美国" }},
"highlight" : {
"pre_tags" : ["", ""],
"post_tags" : [" ", ""],
"fields" : {
"content" : {}
}
}
}
结果:
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.2876821,
"hits": [
{
"_index": "test_index",
"_type": "test_type",
"_id": "hQVVNG4B1-XKoZdra0Jm",
"_score": 0.2876821,
"_source": {
"id": 1,
"content": "美国留给伊拉克的是个烂摊子吗"
},
"highlight": {
"content": [
"美国 留给伊拉克的是个烂摊子吗"
]
}
}
]
}
}
二、拼音插件pinyin
地址:https://github.com/medcl/elasticsearch-analysis-pinyin/releases
源码地址:https://github.com/medcl/elasticsearch-analysis-pinyin#pinyin-analysis-for-elasticsearch
- 安装
[root@localhost elasticsearch-6.4.3]$ ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v6.4.3/elasticsearch-analysis-pinyin-6.4.3.zip
-> Downloading https://github.com/medcl/elasticsearch-analysis-pinyin/releases/download/v6.4.3/elasticsearch-analysis-pinyin-6.4.3.zip
[=================================================] 100%
-> Installed analysis-pinyin
# 插件查看
[root@localhost elasticsearch-6.4.3]$ ls plugins/
analysis-ik analysis-pinyin
[root@localhost elasticsearch-6.4.3]$ ls plugins/analysis-pinyin/
elasticsearch-analysis-pinyin-6.4.3.jar nlp-lang-1.7.jar plugin-descriptor.properties
# 重启ES
[es@localhost elasticsearch-6.4.3]$ ./bin/elasticsearch
- 自定义拼音分析器
PUT /medcl/
{
"settings" : {
"analysis" : {
"analyzer" : {
"pinyin_analyzer" : {
"tokenizer" : "my_pinyin"
}
},
"tokenizer" : {
"my_pinyin" : {
"type" : "pinyin",
"keep_separate_first_letter" : false,
"keep_full_pinyin" : true,
"keep_original" : true,
"limit_first_letter_length" : 16,
"lowercase" : true,
"remove_duplicated_term" : true
}
}
}
}
}
- 分析测试
GET /medcl/_analyze
{
"text": ["刘德华"],
"analyzer": "pinyin_analyzer"
}
结果:
{
"tokens": [
{
"token": "liu",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 0
},
{
"token": "刘德华",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 0
},
{
"token": "ldh",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 0
},
{
"token": "de",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 1
},
{
"token": "hua",
"start_offset": 0,
"end_offset": 0,
"type": "word",
"position": 2
}
]
}
- 示例
1)settings设置
PUT /pinyin_index/
{
"settings" : {
"analysis" : {
"analyzer" : {
"pinyin_analyzer" : {
"tokenizer" : "my_pinyin"
}
},
"tokenizer" : {
"my_pinyin" : {
"type" : "pinyin",
"keep_separate_first_letter" : false,
"keep_full_pinyin" : true,
"keep_original" : true,
"limit_first_letter_length" : 16,
"lowercase" : true,
"remove_duplicated_term" : true
}
}
}
}
}
2)创建mapping
索引:pinyin_index
类型:test_type
字段:name
POST /pinyin_index/test_type/_mapping
{
"test_type": {
"properties": {
"name": {
"type": "text",
"analyzer": "ik_max_word",
"copy_to": true,
"fields": {
"pinyin": {
"type": "text",
"analyzer": "pinyin_analyzer"
}
}
}
}
}
}
3)添加文档内容
POST /pinyin_index/test_type/1
{"name":"刘德华"}
POST /pinyin_index/test_type/2
{"name":"中华人民共和国国歌"}
POST /pinyin_index/_search
# 结果
{
"took": 0,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "pinyin_index",
"_type": "test_type",
"_id": "2",
"_score": 1,
"_source": {
"name": "中华人民共和国国歌"
}
},
{
"_index": "pinyin_index",
"_type": "test_type",
"_id": "1",
"_score": 1,
"_source": {
"name": "刘德华"
}
}
]
}
}
4)测试
POST /pinyin_index/test_type/_search?q=name.pinyin:liu
# 结果
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 0.3439677,
"hits": [
{
"_index": "pinyin_index",
"_type": "test_type",
"_id": "1",
"_score": 0.3439677,
"_source": {
"name": "刘德华"
}
}
]
}
}
POST /pinyin_index/test_type/_search?q=name.pinyin:zhong
POST /pinyin_index/test_type/_search?q=name.pinyin:de
POST /pinyin_index/test_type/_search?q=name.pinyin:guoge
- IK+pinyin分词配置
1)settings设置
PUT /pinyin_index/
{
"settings": {
"analysis": {
"analyzer": {
"ik_pinyin_analyzer": {
"type": "custom",
"tokenizer": "ik_smart",
"filter": ["my_pinyin", "word_delimiter"]
}
},
"filter": {
"my_pinyin": {
"type": "pinyin",
"first_letter": "prefix",
"padding_char": " "
}
}
}
}
}
- 创建mapping
POST /pinyin_index/test_type/_mapping
{
"test_type": {
"properties": {
"name": {
"type": "text",
"fields": {
"pinyin": {
"type": "text",
"store": "true",
"term_vector": "with_positions_offsets",
"analyzer": "ik_pinyin_analyzer",
"boost": 10
}
}
}
}
}
}
3) 添加文档内容
POST /pinyin_index/test_type/1
{"name":"刘德华"}
POST /pinyin_index/test_type/2
{"name":"中华人民共和国国歌"}
- 查询测试
POST /pinyin_index/test_type/_search?q=name.pinyin:guoge
GET /pinyin_index/test_type/_search
{
"query": {
"match": {
"name.pinyin": "国歌"
}
},
"highlight": {
"fields": {
"name.pinyin": {}
}
}
}
GET /pinyin_index/test_type/_search
{
"query": {
"match": {
"name.pinyin": "zhong"
}
},
"highlight": {
"fields": {
"name.pinyin": {}
}
}
}