# Normalization
# Run the built-in "english" analyzer over a sample sentence to inspect
# the tokens it emits.
GET _analyze
{
  "analyzer": "english",
  "text": "Mr. Ma is an excellent teacher"
}
## HTML Strip Character Filter
### Test data: <p>I'm so <a>happy</a>!</p>
# Recreate my_index with:
#   - my_char_filter: an html_strip char filter whose escaped_tags list names
#     the "a" element (per the html_strip docs, escaped tags are left in place)
#   - my_analyzer: applies my_char_filter, then the keyword tokenizer, so the
#     whole filtered input comes back as a single token
DELETE my_index
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "char_filter": ["my_char_filter"],
          "tokenizer": "keyword"
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "html_strip",
          "escaped_tags": ["a"]
        }
      }
    }
  }
}
## Mapping Character Filter
# Recreate my_index with a mapping char filter that rewrites each of the three
# listed characters to "*", then analyze a sample sentence containing them.
# The keyword tokenizer keeps the filtered text as one token so the
# substitutions are easy to see.
DELETE my_index
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "char_filter": ["my_char_filter"],
          "tokenizer": "keyword"
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "mapping",
          "mappings": [
            "滚 => *",
            "垃 => *",
            "圾 => *"
          ]
        }
      }
    }
  }
}

# Exercise the analyzer defined above.
GET my_index/_analyze
{
  "text": "你就是个垃圾!滚",
  "analyzer": "my_analyzer"
}
## Pattern Replace Character Filter
# Sample phone number: 17611001200
# Recreate my_index with a pattern_replace char filter. The pattern captures
# the first three and last four digits of an 11-digit run; the replacement
# keeps both groups and puts "****" in place of the middle four digits.
DELETE my_index
PUT my_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "char_filter": ["my_char_filter"],
          "tokenizer": "keyword"
        }
      },
      "char_filter": {
        "my_char_filter": {
          "type": "pattern_replace",
          "pattern": "(\\d{3})\\d{4}(\\d{4})",
          "replacement": "$1****$2"
        }
      }
    }
  }
}

# Analyze a sentence that embeds the sample phone number.
GET my_index/_analyze
{
  "text": "您的手机号是17611001200",
  "analyzer": "my_analyzer"
}
# Token filter
# Recreate test_index with a synonym_graph filter whose rules are loaded from
# the file analysis/synonym.txt, applied after the ik_max_word tokenizer.
# NOTE(review): ik_max_word is not a built-in tokenizer; presumably the IK
# analysis plugin is installed and the synonym file exists on the node — confirm.
DELETE test_index
PUT /test_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "filter": ["my_synonym"],
          "tokenizer": "ik_max_word"
        }
      },
      "filter": {
        "my_synonym": {
          "type": "synonym_graph",
          "synonyms_path": "analysis/synonym.txt"
        }
      }
    }
  }
}

# Tokens after synonym expansion with the custom analyzer.
GET test_index/_analyze
{
  "text": ["蒙丢丢,大G,霸道,daG"],
  "analyzer": "my_analyzer"
}

# Plain ik_max_word analyzer output, for comparison.
GET test_index/_analyze
{
  "text": ["奔驰G级"],
  "analyzer": "ik_max_word"
}
# Inline synonym rules
# Recreate test_index with a synonym token filter whose rules are declared
# inline: "赵", "钱", "孙" and "李" are each rewritten to "吴", and "周" to "王".
# The filter type must be "synonym"; an unknown type makes the PUT fail, which
# leaves test_index missing for every later request that targets it.
DELETE test_index
PUT /test_index
{
  "settings": {
    "analysis": {
      "filter": {
        "my_synonym": {
          "type": "synonym",
          "synonyms": ["赵,钱,孙,李=>吴", "周=>王"]
        }
      },
      "analyzer": {
        "my_analyzer": {
          "tokenizer": "standard",
          "filter": ["my_synonym"]
        }
      }
    }
  }
}

# Verify the replacements on text containing every left-hand-side term.
GET test_index/_analyze
{
  "analyzer": "my_analyzer",
  "text": ["赵,钱,孙,李", "周"]
}
# Letter case
# Lowercase every token produced by the standard tokenizer.
GET test_index/_analyze
{
  "text": ["AASD ASDA SDASD ASDASD"],
  "tokenizer": "standard",
  "filter": ["lowercase"]
}

# Uppercase every token produced by the standard tokenizer.
GET test_index/_analyze
{
  "text": ["asdasd asd asg dsfg gfhjsdf asfdg g"],
  "tokenizer": "standard",
  "filter": ["uppercase"]
}
# Conditional token filter: apply "uppercase" only to tokens for which the
# script returns true, i.e. terms shorter than 5 characters.
# Both the request-level "filter" and the condition's own "filter" are written
# as arrays, the form the Analyze API and condition filter docs specify.
GET test_index/_analyze
{
  "tokenizer": "standard",
  "filter": [
    {
      "type": "condition",
      "filter": ["uppercase"],
      "script": {
        "source": "token.getTerm().length() < 5"
      }
    }
  ],
  "text": ["asdasd asd asg dsfg gfhjsdf asfdg g"]
}
# Stop words
# Recreate test_index with a standard-type analyzer configured with a custom
# stop word list containing "me" and "you".
DELETE test_index
PUT /test_index
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_analyzer": {
          "stopwords": ["me", "you"],
          "type": "standard"
        }
      }
    }
  }
}

# Analyze a sentence that contains both configured stop words.
GET test_index/_analyze
{
  "text": ["Teacher me and you in the china"],
  "analyzer": "my_analyzer"
}
# Tokenizer
# Run only the ik_max_word tokenizer (no char filters or token filters) over
# two sample sentences.
# NOTE(review): presumably requires the IK analysis plugin — confirm.
GET test_index/_analyze
{
  "text": ["我爱北京天安门", "天安门上太阳升"],
  "tokenizer": "ik_max_word"
}
# Custom analyzer
# Recreate custom_analysis with a fully custom analysis chain:
#   char filters : my_char_filter rewrites "&" to "and" and "|" to "or";
#                  html_strip_char_filter is an html_strip filter with "a"
#                  listed in escaped_tags
#   tokenizer    : my_tokenizer splits on any single space , . ! or ?
#   token filters: lowercase, then my_stopword (custom stop word list)
# lowercase runs BEFORE my_stopword so that capitalised stop words such as
# "Is" or "The" are lowercased first and therefore match the all-lowercase
# stop word list.
DELETE custom_analysis
PUT custom_analysis
{
  "settings": {
    "analysis": {
      "char_filter": {
        "my_char_filter": {
          "type": "mapping",
          "mappings": [
            "& => and",
            "| => or"
          ]
        },
        "html_strip_char_filter": {
          "type": "html_strip",
          "escaped_tags": ["a"]
        }
      },
      "filter": {
        "my_stopword": {
          "type": "stop",
          "stopwords": [
            "is",
            "in",
            "the",
            "a",
            "at",
            "for"
          ]
        }
      },
      "tokenizer": {
        "my_tokenizer": {
          "type": "pattern",
          "pattern": "[ ,.!?]"
        }
      },
      "analyzer": {
        "my_analyzer": {
          "type": "custom",
          "char_filter": ["my_char_filter", "html_strip_char_filter"],
          "tokenizer": "my_tokenizer",
          "filter": ["lowercase", "my_stopword"]
        }
      }
    }
  }
}
# Exercise my_analyzer with text that contains the mapped symbols (& and |),
# the tokenizer's split characters and every configured stop word.
# The text is kept on a single line: a raw line break inside a JSON string
# literal is invalid JSON.
GET custom_analysis/_analyze
{
  "analyzer": "my_analyzer",
  "text": ["What is ,as.df ss in ? & | is ! in the a at for "]
}
# ik_max_word analyzer samples: a long compound phrase, then two lists of
# individual terms (nicknames and slang; a person name and country names).
# NOTE(review): presumably requires the IK analysis plugin — confirm.
GET custom_analysis/_analyze
{
  "text": ["我爱中华人民共和国"],
  "analyzer": "ik_max_word"
}

GET custom_analysis/_analyze
{
  "text": ["蒙丢丢","大G","霸道","渣男","渣女","奥巴马"],
  "analyzer": "ik_max_word"
}

GET custom_analysis/_analyze
{
  "text": ["吴磊","美国","日本","澳大利亚"],
  "analyzer": "ik_max_word"
}