版本:Elasticsearch 7.2.0
1.定义索引时,使用多个分词器
2.在模板中定义时,使用多个分词器
3.ik+pinyin
对人工智能感兴趣的同学,可以点击以下链接:
现在人工智能非常火爆,很多朋友都想学,但是一般的教程都是为博硕生准备的,太难看懂了。最近发现了一个非常适合小白入门的教程,不仅通俗易懂而且还很风趣幽默。所以忍不住分享一下给大家。点这里可以跳转到教程。
https://www.cbedai.net/u014646662
Ik+English分词器
Ik分词器可以对汉语分词,English分词器可以处理英文中的时态、复数、大小写等
PUT /test
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_en_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["en_stemmer"]
        }
      },
      "filter": {
        "en_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "long" },
      "name": {
        "type": "text",
        "analyzer": "ik_en_analyzer"
      },
      "text": { "type": "text" }
    }
  }
}
测试
GET test/_analyze
{
  "field": "name",
  "text": "Saying and doing are two different things."
}
{
"tokens" : [
{
"token" : "sai",
"start_offset" : 0,
"end_offset" : 6,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "do",
"start_offset" : 11,
"end_offset" : 16,
"type" : "ENGLISH",
"position" : 1
},
{
"token" : "two",
"start_offset" : 21,
"end_offset" : 24,
"type" : "ENGLISH",
"position" : 2
},
{
"token" : "differ",
"start_offset" : 25,
"end_offset" : 34,
"type" : "ENGLISH",
"position" : 3
},
{
"token" : "things.",
"start_offset" : 35,
"end_offset" : 42,
"type" : "LETTER",
"position" : 4
},
{
"token" : "thing",
"start_offset" : 35,
"end_offset" : 41,
"type" : "ENGLISH",
"position" : 5
}
]
}
GET test/_analyze
{
  "field": "name",
  "text": "Ik分词器可以对汉语分词"
}
{
"tokens" : [
{
"token" : "ik",
"start_offset" : 0,
"end_offset" : 2,
"type" : "ENGLISH",
"position" : 0
},
{
"token" : "分词器",
"start_offset" : 2,
"end_offset" : 5,
"type" : "CN_WORD",
"position" : 1
},
{
"token" : "分词",
"start_offset" : 2,
"end_offset" : 4,
"type" : "CN_WORD",
"position" : 2
},
{
"token" : "器",
"start_offset" : 4,
"end_offset" : 5,
"type" : "CN_CHAR",
"position" : 3
},
{
"token" : "可以",
"start_offset" : 5,
"end_offset" : 7,
"type" : "CN_WORD",
"position" : 4
},
{
"token" : "以对",
"start_offset" : 6,
"end_offset" : 8,
"type" : "CN_WORD",
"position" : 5
},
{
"token" : "汉语",
"start_offset" : 8,
"end_offset" : 10,
"type" : "CN_WORD",
"position" : 6
},
{
"token" : "分词",
"start_offset" : 10,
"end_offset" : 12,
"type" : "CN_WORD",
"position" : 7
}
]
}
PUT _template/template_default
{
  "index_patterns": ["*"],
  "order": 0,
  "version": 0,
  "settings": {
    "number_of_shards": 2,
    "number_of_replicas": 1,
    "analysis": {
      "analyzer": {
        "ik_en_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["en_stemmer"]
        }
      },
      "filter": {
        "en_stemmer": {
          "type": "stemmer",
          "name": "english"
        }
      }
    }
  },
  "mappings": {
    "date_detection": true,
    "numeric_detection": true,
    "dynamic_templates": [
      {
        "string_fields": {
          "match": "*",
          "match_mapping_type": "string",
          "mapping": {
            "type": "text",
            "norms": false,
            "analyzer": "ik_en_analyzer",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            }
          }
        }
      }
    ]
  }
}
PUT /test
{
  "settings": {
    "number_of_shards": 3,
    "number_of_replicas": 0,
    "analysis": {
      "analyzer": {
        "ik_pinyin_analyzer": {
          "type": "custom",
          "tokenizer": "ik_max_word",
          "filter": ["my_pinyin"]
        }
      },
      "filter": {
        "my_pinyin": {
          "type": "pinyin",
          "keep_separate_first_letter": false,
          "keep_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "remove_duplicated_term": true
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": { "type": "long" },
      "name": {
        "type": "text",
        "analyzer": "ik_pinyin_analyzer"
      },
      "text": { "type": "text" }
    }
  }
}
pinyin分词器详解:https://github.com/medcl/elasticsearch-analysis-pinyin