Elasticsearch 聚合功能

起因：在项目开发过程中，要使用到搜索引擎来对一些关键字实现逆向查询，如果仅用模糊搜索，那么搜索的时间会根据数据量的增大而增大，对比之下就学了elasticsearch，也记录一下，常常回顾。

1. 什么是Elasticsearch聚合功能

类似于数据库中的数据分组（group by），ES中的聚合是对查询出来的数据结果进行分组统计

ES中的聚合命令是aggregation

Bucket Aggregation：一系列满足特定条件的文档集合
Metric Aggregation：一些数学运算，可以对文档字段进行统计分析
Pipeline Aggregation：对聚合结果进行二次聚合
Matrix Aggregation：支持对多个字段的操作并提供一个结果矩阵

select count(brand) #这个就相当于上面的Metric
from cars
group by brand #这个就相当于上面的bucket

大多数Metric都是输出一个值

min / max / sum / avg / cardinality

部分metric支持多个数值

stats / percentiles / percentile_ranks

注意：对字符串类型的字段进行聚合分桶时，只能使用keyword字段，直接对text字段进行是不行的

如果想要在text字段上进行聚合分桶如何做

第一种方式：加个keyword的子字段 nickname.keyword
第二种方式：就对text分词进行分桶

# 在ES中默认对fielddata是false的
# 因为开启text的fielddata后对内存占用高
# 开启后会对该field分词后的词项进行分桶聚合
"nickname": {
 "type": "text",
 "fielddata": true
}

2. 聚合桶实例功能使用和分析

2.1. 使用terms分桶加metric数学运算

# 场景1: 对查询结果进行聚合，得出不同城市的学员数
{
 "query": {
 "match_all": {}
 },
 "aggs": {
 "city_count": {
 "terms": {
 "field": "city"
 }
 }
 }
}
# 场景2: 对查询结果进行聚合，得出不同城市的学员数,并得出各个城市学员消费的最大最小平均
# 关键字： terms / aggs / min / max / avg 
# aggs下面的第一个key是你的分组名
{
 "query": {
 "match_all": {}
 },
 "size": 0,
 "aggs": {
 "city_count": {
 "terms": {
 "field": "city"
 },
 "aggs": {
 "avg_consume": {
 "avg": {
 "field": "consume"
 }
 },
 "max_consume": {
 "max": {
 "field": "consume"
 }
 },
 "min_consume": {
 "min": {
 "field": "consume"
 }
 }
 }
 }
 }
}
# 场景3: 对查询结果进行聚合，得出不同城市的学员数,城市的平均消费水平,该城市学员的性别分布
{
 "query": {
 "match_all": {}
 },
 "size": 0,
 "aggs": {
 "city_count": {
 "terms": {
 "field": "city"
 },
 "aggs": {
 "avg_consume": {
 "avg": {
 "field": "consume"
 }
 },
 "sex_count": {
 "terms": {
 "field": "sex"
 }
 }
 }
 }
 }
}
# 将场景3修改为stats的多输出metric
{
 "query": {
 "match_all": {}
 },
 "size": 0,
 "aggs": {
 "city_count": {
 "terms": {
 "field": "city"
 },
 "aggs": {
 "avg_stats": {
 "stats": {
 "field": "consume"
 }
 },
 "sex_count": {
 "terms": {
 "field": "sex"
 }
 }
 }
 }
 }
}
# 场景4：统计一下各个城市学员，年龄最大排前2的并返回信息
{
 "query": {
 "match_all": {}
 },
 "size": 0,
 "aggs": {
 "city_count": {
 "terms": {
 "field": "city"
 },
 "aggs": {
 "age_top": {
 "top_hits": {
 "size": 2,
 "sort": [
 {
 "age": "desc"
 }
 ]
 }
 }
 }
 }
 }
}

2.2. 基础数据

Elasticsearch安装时要提前把ik分词器，pinyin分词器安装好

mapping

# POST  /index_customer/_mapping
{
 "properties": {
 "id": {
 "type": "long"
 },
 "age": {
 "type": "integer"
 },
 "username": {
 "type": "keyword"
 },
 "nickname": {
 "type": "text",
 "analyzer": "ik_max_word",
 "fields": {
 "pinyin": {
 "type": "text",
 "analyzer": "pinyin"
 }
 }
 },
 "consume": {
 "type": "float"
 },
 "desc": {
 "type": "text",
 "analyzer": "ik_max_word"
 },
 "sex": {
 "type": "byte"
 },
 "birthday": {
 "type": "date"
 },
 "city": {
 "type": "keyword"
 },
 "faceimg": {
 "type": "text",
 "index": false
 }
 }
}

document

# POST /index_customer/_doc/1001
{
 "id": 1001,
 "age": 24,
 "username": "kingliu",
 "nickname": "飞上天空做太阳",
 "consume": 1289.88,
 "desc": "我在艾编程学习java和vue，学习到了很多知识",
 "sex": 1,
 "birthday": "1996-12-24",
 "city": "北京",
 "faceimg": "https://www.icodingedu.com/img/customers/1001/logo.png"
}
# POST /index_customer/_doc/1002
{
 "id": 1002,
 "age": 26,
 "username": "lucywang",
 "nickname": "夜空中最亮的星",
 "consume": 6699.88,
 "desc": "我在艾编程学习VIP课程，非常打动我",
 "sex": 0,
 "birthday": "1994-02-24",
 "city": "北京",
 "faceimg": "https://www.icodingedu.com/img/customers/1002/logo.png"
}
# POST /index_customer/_doc/1003
{
 "id": 1003,
 "age": 30,
 "username": "kkstar",
 "nickname": "照亮你的星",
 "consume": 7799.66,
 "desc": "老师们授课风格不同，但结果却是异曲同工，讲的很不错，值得推荐",
 "sex": 1,
 "birthday": "1990-12-02",
 "city": "北京",
 "faceimg": "https://www.icodingedu.com/img/customers/1003/logo.png"
}
# POST /index_customer/_doc/1004
{
 "id": 1004,
 "age": 31,
 "username": "alexwang",
 "nickname": "骑着老虎看日出",
 "consume": 9799.66,
 "desc": "课程内容充实，有料，很有吸引力，赞一个",
 "sex": 1,
 "birthday": "1989-05-09",
 "city": "上海",
 "faceimg": "https://www.icodingedu.com/img/customers/1004/logo.png"
}
# POST /index_customer/_doc/1005
{
 "id": 1005,
 "age": 32,
 "username": "jsonzhang",
 "nickname": "我是你的神话",
 "consume": 12789.66,
 "desc": "需要抽时间把所有内容都学个遍，太给力料",
 "sex": 1,
 "birthday": "1988-07-19",
 "city": "上海",
 "faceimg": "https://www.icodingedu.com/img/customers/1005/logo.png"
}
# POST /index_customer/_doc/1006
{
 "id": 1006,
 "age": 27,
 "username": "abbyli",
 "nickname": "好甜的棉花糖",
 "consume": 10789.86,
 "desc": "还不错，内容超过我的预期值，钱花的值",
 "sex": 0,
 "birthday": "1993-10-11",
 "city": "上海",
 "faceimg": "https://www.icodingedu.com/img/customers/1006/logo.png"
}
# POST /index_customer/_doc/1007
{
 "id": 1007,
 "age": 33,
 "username": "jacktian",
 "nickname": "船长jack",
 "consume": 9789.86,
 "desc": "一直想抽时间学习，这下有时间整了，给力",
 "sex": 1,
 "birthday": "1987-09-16",
 "city": "深圳",
 "faceimg": "https://www.icodingedu.com/img/customers/1007/logo.png"
}
# POST /index_customer/_doc/1008
{
 "id": 1008,
 "age": 23,
 "username": "feifei",
 "nickname": "我爱篮球",
 "consume": 6689.86,
 "desc": "虽然又一些不太懂，但只要有时间，相信一定能学好的",
 "sex": 1,
 "birthday": "1997-04-18",
 "city": "深圳",
 "faceimg": "https://www.icodingedu.com/img/customers/1008/logo.png"
}
# POST /index_customer/_doc/1009
{
 "id": 1009,
 "age": 25,
 "username": "daisyzhang",
 "nickname": "一起看日出",
 "consume": 6680,
 "desc": "学习起来还是很有意思的，值得我多次学习",
 "sex": 0,
 "birthday": "1995-03-27",
 "city": "深圳",
 "faceimg": "https://www.icodingedu.com/img/customers/1009/logo.png"
}
# POST /index_customer/_doc/1010
{
 "id": 1010,
 "age": 29,
 "username": "ethenhe",
 "nickname": "旋风小子",
 "consume": 6699.99,
 "desc": "课程严谨，知识丰富，讲解到位，编程给力",
 "sex": 1,
 "birthday": "1991-06-22",
 "city": "广州",
 "faceimg": "https://www.icodingedu.com/img/customers/1010/logo.png"
}
# POST /index_customer/_doc/1011
{
 "id": 1011,
 "age": 27,
 "username": "lydiazheng",
 "nickname": "跳芭蕾的小妮",
 "consume": 8899.99,
 "desc": "自己一直都没有动力去系统学习，这次有了老师和大家的督促，非常不错，终于坚持下来了",
 "sex": 0,
 "birthday": "1993-08-27",
 "city": "广州",
 "faceimg": "https://www.icodingedu.com/img/customers/1011/logo.png"
}
# POST /index_customer/_doc/1012
{
 "id": 1012,
 "age": 37,
 "username": "draglong",
 "nickname": "飞龙在天",
 "consume": 18899.99,
 "desc": "技术宅，就喜欢研究技术，和大家碰撞感觉很好",
 "sex": 1,
 "birthday": "1983-11-22",
 "city": "广州",
 "faceimg": "https://www.icodingedu.com/img/customers/1012/logo.png"
}

3. ES桶聚合后相关指标分析&text字段聚合方式

如果想要对text类型的字段进行分桶

1、给field增加keyword的子字段

POST /index_customer/_mapping
{
    "properties": {
        "nickname": {
            "analyzer": "ik_max_word",
            "type": "text",
            "fields": {
                "pinyin": {
                    "analyzer": "pinyin",
                    "type": "text"
                },
                "keyword": {
                    "ignore_above": 256,
                    "type": "keyword"
                }
            }
        }
    }
}
# 在数据添加后增加子字段需要将index进行更新
POST /index_customer/_update_by_query

2、给field增加fielddata

# fielddata是对text文本进行分词后的桶聚合
# 默认是false，打开会比较占内存，所以没有必要的情况下不要开启
POST /index_customer/_mapping
{
    "properties": {
        "nickname": {
            "analyzer": "ik_max_word",
            "type": "text",
            "fielddata": true,
            "fields": {
                "pinyin": {
                    "analyzer": "pinyin",
                    "type": "text",
                    "fielddata": true
                },
                "keyword": {
                    "ignore_above": 256,
                    "type": "keyword"
                }
            }
        }
    }
}

分桶返回的参数分析

doc_count_error_upper_bound：未出现在本次聚合结果中的词项可能具有的文档数上限（即潜在的统计误差上限）
sum_other_doc_count：表示本次聚合中还有多少文档没有统计展示出来
- 桶默认聚合展示10条
- 可以使用size来调整条目数
- 只能指定条目数，不能分页
buckets：会根据结果的统计降序排列

size进行桶查询的展示

POST /index_customer/_search
{
    "query": {
        "match_all": {}
    },
    "size": 0,
    "aggs": {
        "nickname_term": {
            "terms": {
                "field": "nickname",
                "size": 20
            }
        }
    }
}

当索引中频繁有新的doc加入，并且这个field会频繁进行分桶聚合时，需要添加一个缓存配置

# 频繁聚合查询，索引不断有新的doc加入
# "eager_global_ordinals": true
POST /index_customer/_mapping
{
    "properties": {
        "nickname": {
            "analyzer": "ik_max_word",
            "type": "text",
            "fielddata": true,
            "eager_global_ordinals": true,
            "fields": {
                "pinyin": {
                    "analyzer": "pinyin",
                    "type": "text",
                    "fielddata": true,
                    "eager_global_ordinals": true
                },
                "keyword": {
                    "ignore_above": 256,
                    "type": "keyword"
                }
            }
        }
    }
}

分组基数查询

# cardinality统计桶分词的基数
POST /index_customer/_search
{
    "query": {
        "match_all": {}
    },
    "size": 0,
    "aggs": {
        "nickname_term": {
            "cardinality": {
                "field": "nickname"
            }
        }
    }
}

4. 桶range计算

就是一个区间值的查询

POST /index_customer/_search
{
    "query": {
        "match_all": {}
    },
    "size": 0,
    "sort": [
        {
            "consume": "desc"
        }
    ],
    "aggs": {
        "city_count": {
            "terms": {
                "field": "city"
            }
        },
        "consume_range": {
            "range": {
                "field": "consume",
                "ranges": [
                    {
                        "to": 3000
                    },
                    {
                        "from": 3000,
                        "to": 6000
                    },
                    {
                        "from": 6000,
                        "to": 9000
                    },
                    {
                        "from": 9000
                    }
                ]
            }
        }
    }
}

直方图的聚合

POST /index_customer/_search
{
    "query": {
        "match_all": {}
    },
    "size": 0,
    "sort": [
        {
            "consume": "desc"
        }
    ],
    "aggs": {
        "city_count": {
            "terms": {
                "field": "city"
            }
        },
        "consume_histogram": {
            "histogram": {
                "field": "consume",
                "interval": 2000,
                "extended_bounds": {
                    "min": 0,
                    "max": 20000
                }
            }
        }
    }
}

5. Pipeline聚合计算

pipeline就是对聚合分析再做一次聚合分析

# 场景：从所有城市的平均消费中，拿出最低的那个城市
GET /index_customer/_search
{
    "query": {
        "match_all": {}
    },
    "size": 0,
    "sort": [
        {
            "consume": "desc"
        }
    ],
    "aggs": {
        "city_count": {
            "terms": {
                "field": "city"
            },
            "aggs": {
                "avg_consume": {
                    "avg": {
                        "field": "consume"
                    }
                }
            }
        },
        "min_consume_by_city": {
            "min_bucket": {
                "buckets_path": "city_count>avg_consume"
            }
        }
    }
}
# min_bucket / buckets_path 是关键字
# max_bucket / min_bucket / avg_bucket / sum_bucket / stats_bucket

不要以为每天把功能完成了就行了，这种思想是要不得的，互勉~！