By default, Elasticsearch sorts matching search results by relevance score, which measures how well each document matches the query.
The relevance score is a positive floating-point number, returned in the _score metadata field of the search response. The higher the _score, the more relevant the document.
Reference: the Query DSL documentation.
Below are sample ES scripts that can be run in the Kibana Dev Tools console.
DELETE /user
# Set the index's default analyzer to ik_max_word (fine-grained IK)
PUT /user
{
"settings": {
"index":{
"analysis.analyzer.default.type":"ik_max_word"
}
}
}
# Query DSL in practice
GET /user
### match_all query ###
# Use _source to specify which fields should be returned
GET /user/_search
{
"query":{
"match_all": {}
},
"_source":["name"]
}
# from/size pagination; 10 documents are returned by default
GET /user/_search
{
"query":{
"match_all": {}
},
"from":0,
"size": 4,
"sort": [
{
"age": {
"order": "desc"
}
}
]
}
### match_all query ###
### term exact-match queries ###
GET /user/_search
{
"query":{
"term": {
"name.keyword": {
"value": "张三"
}
}
}
}
# terms query: match any of several exact values
GET /user/_search
{
"query":{
"terms": {
"name.keyword": ["张三","李四"]
}
}
}
GET /user/_search
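# Index the sample user documents used by the queries in this section
# (run these before the queries above)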
PUT /user/_doc/1
{
"id":1,
"name":"张三",
"age":16,
"address":"陕西西安"
}
PUT /user/_doc/2
{
"id":2,
"name":"李四",
"age":18,
"address":"陕西渭南"
}
PUT /user/_doc/3
{
"id":3,
"name":"王五",
"age":19,
"address":"广州天河"
}
PUT /user/_doc/4
{
"id":4,
"name":"赵六",
"age":30,
"address":"广州白云"
}
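# Add a birthday field to each document via partial updates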
POST /user/_update/1
{
"doc": {
"brithday":"2007-04-12"
}
}
POST /user/_update/2
{
"doc": {
"brithday":"2005-04-12"
}
}
POST /user/_update/3
{
"doc": {
"brithday":"2004-04-12"
}
}
POST /user/_update/4
{
"doc": {
"brithday":"1993-04-12"
}
}
# Range query: birthday greater than or equal to now-20y (2003, if the current year is 2023)
GET /user/_search
{
"query": {
"range": {
"brithday": {
"gte": "now-20y"
}
}
}
}
# Range query: birthday less than or equal to now-20y
GET /user/_search
{
"query": {
"range": {
"brithday": {
"lte": "now-20y"
}
}
}
}
# Prefix query
# A prefix query has to walk the terms of the inverted index and check
# whether each term starts with the given prefix, so it can be expensive
GET /user/_search
{
"query": {
"prefix": {
"address": {
"value": "广州"
}
}
}
}
# Fuzzy query: fuzziness is the allowed edit distance, at most 2
GET /user/_search
{
"query": {
"fuzzy": {
"address": {
"value": "广周",
"fuzziness": 1
}
}
}
}
### term exact-match queries ###
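# term query against the analyzed text field address: with the ik analyzer
# configured above, 广州 is indexed as a standalone token, so this matches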
GET /user/_search
{
"query": {
"term": {
"address": {
"value": "广州"
}
}
}
}
Full-text queries
Below are sample ES scripts that can be run in the Kibana Dev Tools console.
# match query
# match analyzes the query string first, then matches on the resulting tokens.
# With operator and, every token must match
GET /user/_search
{
"query": {
"match": {
"address": {
"query": "陕西广州",
"operator": "and"
}
}
}
}
# When the operator parameter is or (the default),
# the minimum_should_match parameter sets the minimum number of tokens that must match.
# Here at least 2 tokens, i.e. both 陕西 and 广州, must match
GET /user/_search
{
"query": {
"match": {
"address": {
"query": "陕西广州",
"minimum_should_match": 2
}
}
}
}
# multi_match query: search across several fields
# name.keyword is used below so the name is matched without analysis
GET /user/_search
{
"query": {
"multi_match": {
"query": "张三陕西",
"fields": [
"address",
"name.keyword"
]
}
}
}
# match_phrase query
# Requires the tokens to be adjacent and in order
# slop sets the maximum distance allowed between tokens
GET /user/_search
{
"query": {
"match_phrase": {
"address": {
"query": "陕西渭南",
"slop": 2
}
}
}
}
# query_string query
GET /user/_search
{
"query": {
"query_string": {
"default_field": "address",
"query": "西安 OR 渭南"
}
}
}
# bool query: every must clause has to match
GET /user/_search
{
"query": {
"bool": {
"must": [
{
"match": {
"address": "陕西"
}
},{
"match": {
"name": "张三"
}
}
]
}
}
}
# should: at least one clause must match
GET /user/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"address": "陕西"
}
},{
"match": {
"name": "张三"
}
}
]
}
}
}
# Highlighting
# require_field_match must be set to false to highlight multiple fields
GET /user/_search
{
"query": {
"term": {
"name": {
"value": "张三"
}
}
},
"highlight": {
"pre_tags": [""],
"post_tags": [""],
"require_field_match": "false",
"fields": {
"*":{}
}
}
}
If the requested page is very deep, i.e. from + size exceeds 10,000, problems arise.
To return hits 10,001-10,100, ES must materialize the top 10,100 hits and re-sort them. The further back the requested page, the more memory this consumes, so frequent deep-pagination queries can cause OOM (Out Of Memory) errors and frequent full GCs.
To keep users from making this mistake without understanding the internals, ES enforces a threshold, max_result_window, which defaults to 10,000; its purpose is to protect the heap from being exhausted by careless queries.
The best solution to deep pagination is to avoid it. Google and Baidu, the largest search engines globally and in China, have both dropped the "jump to page" control from their pagination bars in favor of a plain "next page", and Taobao and JD simply cap search results at 100 pages.
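A minimal sketch against the user index above: a page that crosses the window is rejected, and the guard can be raised per index (at the cost of heap) through the index.max_result_window setting.
# Requesting from + size > 10000 fails with "Result window is too large"
GET /user/_search
{
"from": 10000,
"size": 10
}
# Raise the guard for this index (illustrative value; deeper pages cost more heap)
PUT /user/_settings
{
"index.max_result_window": 20000
}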
search_after queries
The scroll API is good for efficient deep scrolling, but scroll contexts are expensive to keep alive, so it is not recommended for real-time user requests. The search_after parameter sidesteps this by acting as a live cursor: the results of the previous page are used to retrieve the next one.
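A minimal sketch assuming the four sample user documents above: page one sorts on a stable, unique key combination; each subsequent page passes the previous page's last sort values through search_after.
# Page 1: sort on age with id as a unique tie-breaker
GET /user/_search
{
"size": 2,
"sort": [
{"age": "desc"},
{"id": "asc"}
]
}
# Page 2: pass page 1's last sort values ([19, 3] for the sample data) via search_after
GET /user/_search
{
"size": 2,
"sort": [
{"age": "desc"},
{"id": "asc"}
],
"search_after": [19, 3]
}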
Character filters reference | Elasticsearch Guide [7.17] | Elastic
Components of an analyzer
An analyzer consists of character filters, a tokenizer, and token filters. The tokenizer is the core component: its job is tokenization, i.e. splitting the original text into fine-grained pieces, each of which is called a term. You can think of a tokenizer as a predefined set of splitting rules. Elasticsearch ships with many built-in tokenizers; the default is standard.
Analysis never changes the source document; it applies only to the inverted index and to search terms.
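A minimal sketch that wires all three parts into one custom analyzer (the index name test_custom_analyzer is illustrative):
# char_filter runs first, then the tokenizer, then the token filters
PUT /test_custom_analyzer
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"char_filter": ["html_strip"],
"tokenizer": "standard",
"filter": ["lowercase", "stop"]
}
}
}
}
}
# Yields the terms: quick, brown, foxes
GET /test_custom_analyzer/_analyze
{
"analyzer": "my_analyzer",
"text": ["<p>The QUICK Brown Foxes</p>"]
}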
# Token filters
GET _analyze
{
"tokenizer": "standard",
"filter": ["stop"],
"text": ["What are you doing"]
}
# Custom Chinese tokenization rules
# The config files live under elasticsearch-7.17.3\plugins\elasticsearch-analysis-ik\config,
# where you will find stopword.dic and extra_stopword.dic. stopword.dic holds the
# configured English stop words, extra_stopword.dic the Chinese ones.
# Only stopword.dic is used by default; to enable extra_stopword.dic you must register
# it in IK's config file, IKAnalyzer.cfg.xml, under the "IK Analyzer 扩展配置" entry.
# extra_stopword.dic ships with the stop word 了, so running the request below drops 了
GET _analyze
{
"tokenizer": "ik_max_word",
"filter": ["stop"],
"text":["我饿了"]
}
# Custom stop words
DELETE /my_token_filter_stop
PUT my_token_filter_stop
{
"settings": {
"analysis": {
"filter": {
"my_filter":{
"type":"stop",
"stopwords":["www"],
"ignore_case":true
}
}
}
}
}
GET my_token_filter_stop/_analyze
{
"tokenizer": "standard",
"filter": ["my_filter"],
"text":["What are you doing today www "]
}
# Synonyms
DELETE test_token_filter_synonym
PUT test_token_filter_synonym
{
"settings": {
"analysis": {
"filter": {
"my_synonym": {
"type": "synonym",
"synonyms": [ "你好, 喂 => hello" ]
}
}
}
}
}
GET test_token_filter_synonym/_analyze
{
"tokenizer": "ik_max_word",
"filter": ["my_synonym"],
"text": ["你好"]
}
# Character filter: html_strip, preserving <a> tags
PUT test_html_strip
{
"settings": {
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "html_strip",
"escaped_tags":["a"]
}
}
}
}
}
GET test_html_strip/_analyze
{
"tokenizer": "standard",
"char_filter": ["my_char_filter"],
"text": ["<p>I'm learning <a>elasticSearch</a></p>"]
}
# Mapping character filter
# type mapping selects the character-mapping filter
# Every character listed in mappings is replaced by the character after =>
PUT test_html_strip_filter
{
"settings": {
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"笨 => *",
"蛋 => *"
]
}
}
}
}
}
GET test_html_strip_filter/_analyze
{
"char_filter": ["my_char_filter"],
"text": "你就是个大笨蛋!!"
}
# type pattern_replace selects the regex-replace character filter
PUT text_pattern_replace_filter
{
"settings": {
"analysis": {
"char_filter": {
"my_char_filter": {
"type": "pattern_replace",
"pattern": """(\d{3})\d{4}(\d{4})""",
"replacement": "$1****$2"
}
}
}
}
}
GET text_pattern_replace_filter/_analyze
{
"char_filter": ["my_char_filter"],
"text": "您的手机号是18868686688"
}
Disjunction max query | Elasticsearch Guide [7.17] | Elastic
DELETE test_score
PUT /test_score/_bulk
{"index":{"_id":1}}
{"content":"我们都在使用elasticsearch做搜索引擎"}
{"index":{"_id":2}}
{"content":"我喜欢elasticsearch"}
{"index":{"_id":3}}
{"content":"你知道elasticsearch吗"}
# Inspect the TF-IDF scoring with the Explain API
GET /test_score/_search
{
"explain": true,
"query": {
"match": {
"content": "elasticsearch"
}
}
}
GET /test_score/_explain/2
{
"query": {
"match": {
"content": "elasticsearch"
}
}
}
# Boosting is one way to control relevance: a field's boost value influences the result ranking
# Bulk bodies are NDJSON: each action line and source line must be a single line
POST /blogs/_bulk
{"index":{"_id":1}}
{"title":"华为手机","content":"华为手机,华为电脑"}
{"index":{"_id":2}}
{"title":"华为手机,华为电脑","content":"华为手机"}
# boost sets a clause's weight; a value above 1 raises its relative weight
# Here we want content matches to carry more weight
GET /blogs/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"title": {
"query": "华为,电脑",
"boost": 1
}
}
},
{
"match": {
"content": {
"query": "华为,电脑",
"boost": 4
}
}
}
]
}
}
}
# When some results are undesirable but should not be excluded outright (as must_not would),
# consider the boosting query's negative_boost.
# positive is what you want, negative what you don't;
# negative_boost lowers the relevance of negative matches: the smaller its value,
# the lower their score, which pushes them toward the end of the ranking.
# Here we want content about 手机 (phones) ranked first and 电脑 (computers) last
GET /blogs/_search
{
"query": {
"boosting": {
"positive": {
"match": {
"content": "手机"
}
},
"negative": {
"match": {
"content": "电脑"
}
},
"negative_boost": 0.2
}
}
}
DELETE /news
PUT /news/_doc/1
{
"title": "棕色的动物",
"body": "熊"
}
PUT /news/_doc/2
{
"title": "关于一只的故事",
"body": "有一只棕色的熊"
}
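# bool should: a hit's _score is the sum of its matching clauses' scores
# (contrast with dis_max below)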
POST /news/_search
{
"query": {
"bool": {
"should": [
{ "match": { "title": "棕色的熊" }},
{ "match": { "body": "棕色的熊" }}
]
}
}
}
# Best-field search with the dis_max query
# Returns any document matching any of the queries, using only the best-matching
# field's score as the final score: max(a, b)
POST /news/_search
{
"query": {
"dis_max": {
"queries": [
{ "match": { "title": "棕色的熊" }},
{ "match": { "body": "棕色的熊" }}
]
}
}
}
# Tune with the tie_breaker parameter
# final score = best matching field's score + other matching fields' scores * tie_breaker
POST /news/_search
{
"query": {
"dis_max": {
"queries": [
{ "match": { "title": "棕色的熊" }},
{ "match": { "body": "棕色的熊" }}
],
"tie_breaker": 0.1
}
}
}
# Best Fields search
# The best_fields strategy scores by the best matching field:
# final_score = max(per-field scores), with other fields folded in via tie_breaker
POST /news/_search
{
"query": {
"multi_match": {
"type": "best_fields",
"query": "棕色的熊",
"fields": ["title","body"],
"tie_breaker": 0.2
}
}
}
Bucket aggregations | Elasticsearch Guide [7.17] | Elastic
When aggregating and analyzing massive data sets, Elasticsearch sacrifices some search accuracy to stay real-time.
Why results can be inaccurate: the data is spread across multiple shards, and the aggregation only takes the top X buckets from each shard, so the merged result may be off. ES could aggregate everything on each shard instead of a per-shard top X, but that would carry a severe performance cost.
There are two ways to reduce the loss: keep the data on a single shard, or have each shard return more candidate buckets by raising shard_size, as sketched below.
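A minimal sketch of the second option against the user index (the values are illustrative): shard_size makes each shard return more candidate buckets than the final size, and show_term_doc_count_error exposes doc_count_error_upper_bound, the worst-case count error, in the response.
GET /user/_search
{
"size": 0,
"aggs": {
"addr": {
"terms": {
"field": "address.keyword",
"size": 3,
"shard_size": 100,
"show_term_doc_count_error": true
}
}
}
}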
# Aggregations
GET /user/_search
# Multiple metric aggregations: find the min, max, and average age
POST /user/_search
{
"size": 0,
"aggs": {
"max_age": {
"max": {
"field": "age"
}
},
"min_age": {
"min": {
"field": "age"
}
},
"avg_age": {
"avg": {
"field": "age"
}
}
}
}
POST /user/_search
{
"size": 0,
"aggs": {
"stats_age": {
"stats": {
"field":"age"
}
}
}
}
# Bucket aggregations (grouping)
# Aggregate on a keyword field
PUT /user/_doc/5
{
"id":5,
"name":"小明",
"age":30,
"address":"广州白云",
"brithday":"1993-04-12"
}
GET /user/_search
{
"size": 0,
"aggs": {
"addr": {
"terms": {
"field":"address.keyword"
}
}
}
}
# Limiting the aggregation scope
# Group only users aged 20 or older
GET /user/_search
{
"query": {
"range": {
"age": {
"gte": 20
}
}
},
"size": 0,
"aggs": {
"addr": {
"terms": {
"field":"address.keyword",
"size": 10,
"order": {
"_count": "desc"
}
}
}
}
}
# Bucket by age range
# A range aggregation lets you define a custom key per range
POST /user/_search
{
"size": 0,
"aggs": {
"age_range": {
"range": {
"field":"age",
"ranges":[
{
"to":19
},
{
"from":19,
"to":100
},
{
"key":"大于19岁的",
"from":19
}
]
}
}
}
}
# interval: histogram buckets of 5 years each
POST /user/_search
{
"size": 0,
"aggs": {
"age_histrogram": {
"histogram": {
"field":"age",
"interval":5,
"extended_bounds":{
"min":0,
"max":50
}
}
}
}
}
# top_hits use case: after bucketing, list the best-matching documents inside each bucket
# Group by address and, per group, return the 2 oldest users (size limits hits per bucket)
POST /user/_search
{
"size": 0,
"aggs": {
"addr": {
"terms": {
"field":"address.keyword"
},
"aggs":{
"old_addr":{
"top_hits":{
"size":2,
"sort":[
{
"age":{
"order":"desc"
}
}
]
}
}
}
}
}
}
# Deeper nesting: bucket by address, then by age, and compute age statistics per bucket
POST /user/_search
{
"size": 0,
"aggs": {
"addr_stats": {
"terms": {
"field": "address.keyword"
},
"aggs": {
"age_stats": {
"terms": {
"field": "age"
},
"aggs": {
"age_stats": {
"stats": {
"field": "age"
}
}
}
}
}
}
}
}
# Which location has the lowest average age? (min_bucket pipeline aggregation)
POST /user/_search
{
"size": 0,
"aggs": {
"addr": {
"terms": {
"field": "address.keyword",
"size": 10
},
"aggs": {
"avg_age": {
"avg": {
"field": "age"
}
}
}
},
"min_age_user":{
"min_bucket": {
"buckets_path": "addr>avg_age"
}
}
}
}
# Query users aged 20 or older, then group that result set by address
POST /user/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 20
}
}
},
"aggs": {
"addr": {
"terms": {
"field":"address.keyword"
}
}
}
}
# Filter users aged 20 or older inside the aggregation (filter bucket), then group by address
POST /user/_search
{
"size": 0,
"aggs": {
"old_user": {
"filter": {
"range": {
"age": {
"from": 19
}
}
},
"aggs": {
"addr": {
"terms": {
"field": "address.keyword"
}
}
}
},
"all_addrs": {
"terms": {
"field": "address.keyword"
}
}
}
}
# Aggregate over all users' addresses, then use post_filter to narrow the returned hits
# without affecting the aggregation
POST /user/_search
{
"aggs": {
"addr": {
"terms": {
"field": "address.keyword"
}
}
},
"post_filter": {
"match": {
"address.keyword": "陕西西安"
}
}
}
# Ordering buckets with order
POST /user/_search
{
"size": 0,
"query": {
"range": {
"age": {
"gte": 20
}
}
},
"aggs": {
"addr": {
"terms": {
"field":"address.keyword",
"order":[
{"_key":"desc"}
]
}
}
}
}
Nested query | Elasticsearch Guide [7.17] | Elastic
The nested data type allows objects inside an array to be indexed independently of one another.
Internally, nested objects are stored as separate Lucene documents and joined with the root document at query time.
DELETE /my_movies
# Create a mapping with a nested object
PUT /my_movies
{
"mappings": {
"properties": {
"actors": {
"type": "nested",
"properties": {
"first_name": {
"type": "keyword"
},
"last_name": {
"type": "keyword"
}
}
},
"title": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
POST /my_movies/_doc/1
{
"title": "龙门飞甲",
"actors": [
{
"first_name": "李",
"last_name": "连杰"
},
{
"first_name": "甄",
"last_name": "子丹"
}
]
}
# Nested query
POST /my_movies/_search
{
"query": {
"bool": {
"must": [
{"match": {"title": "龙门飞甲"}},
{
"nested": {
"path": "actors",
"query": {
"bool": {
"must": [
{"match": {
"actors.first_name": "李"
}},
{"match": {
"actors.last_name": "连杰"
}}
]
}
}
}
}
]
}
}
}
# Nested aggregation
POST /my_movies/_search
{
"size": 0,
"aggs": {
"actors_agg": {
"nested": {
"path": "actors"
},
"aggs": {
"actor_name": {
"terms": {
"field": "actors.first_name",
"size": 10
}
}
}
}
}
}
# A plain (non-nested) aggregation does not work on nested fields
POST /my_movies/_search
{
"size": 0,
"aggs": {
"actors_agg": {
"terms": {
"field": "actors.first_name",
"size": 10
}
}
}
}
Has child query | Elasticsearch Guide [7.17] | Elastic
Limitations of object and nested fields: every update may require reindexing the entire document (root object and nested objects included).
ES provides an equivalent of a relational-database join through the join data type: by maintaining a parent/child relation, the two objects are kept separate.
DELETE /my_blogs
# Define the Parent/Child mapping
# In the relations object, the key blog is the parent name and comment is the child name
PUT /my_blogs
{
"settings": {
"number_of_shards": 2
},
"mappings": {
"properties": {
"blog_comments_relation": {
"type": "join",
"relations": {
"blog": "comment"
}
},
"content": {
"type": "text"
},
"title": {
"type": "keyword"
}
}
}
}
# Index parent document blog1
PUT /my_blogs/_doc/blog1
{
"title":"我正在学习elasticSearch",
"content":"elasticSearch感觉它很强大",
"blog_comments_relation":{
"name":"blog"
}
}
# Index parent document blog2
PUT /my_blogs/_doc/blog2
{
"title":"我正在学习go语言",
"content":"go语言感觉在处理并发方面感觉它很强大",
"blog_comments_relation":{
"name":"blog"
}
}
# Index child document comment1; parent holds the parent document's id
# A parent and its children must live on the same shard to make the query-time join efficient,
# so when indexing a child you must supply the parent id as the routing value,
# which routes the child to the same shard
PUT /my_blogs/_doc/comment1?routing=blog1
{
"comment":"我也在学习ELK",
"username":"小张",
"blog_comments_relation":{
"name":"comment",
"parent":"blog1"
}
}
# Index child document comment2
PUT /my_blogs/_doc/comment2?routing=blog2
{
"comment":"我也学习go语言方面的知识",
"username":"小张",
"blog_comments_relation":{
"name":"comment",
"parent":"blog2"
}
}
# Index child document comment3
PUT /my_blogs/_doc/comment3?routing=blog2
{
"comment":"我觉得elk知识点还是很多",
"username":"小丽",
"blog_comments_relation":{
"name":"comment",
"parent":"blog2"
}
}
# Query all documents
POST /my_blogs/_search
# Fetch a parent document by id
GET /my_blogs/_doc/blog2
# parent_id query: fetch the children of a given parent
POST /my_blogs/_search
{
"query": {
"parent_id": {
"type": "comment",
"id": "blog2"
}
}
}
# has_child query: returns parent documents
POST /my_blogs/_search
{
"query": {
"has_child": {
"type": "comment",
"query": {
"match": {
"username": "小张"
}
}
}
}
}
# has_parent query: returns the matching child documents
POST /my_blogs/_search
{
"query": {
"has_parent": {
"parent_type": "blog",
"query": {
"match": {
"title": "我正在学习go语言"
}
}
}
}
}
# Fetch a child document by ID alone (not found without the routing value)
GET /my_blogs/_doc/comment3
# Fetch a child document by ID plus routing
GET /my_blogs/_doc/comment3?routing=blog2
# Update a child document
PUT /my_blogs/_doc/comment3?routing=blog2
{
"comment": "我觉得elk知识点学习起来还是很多了",
"blog_comments_relation": {
"name": "comment",
"parent": "blog2"
}
}