We do not run real-time ES aggregation queries for two reasons: the query volume is too large, and real-time queries would require keeping a huge amount of raw logs online. With only 5 ES data nodes the cluster cannot hold that much raw log data, so raw logs are kept for a fixed number of days and then deleted.
Of course, for data that only needs to cover the last few days, we simply use ES's own aggregation capability directly, as in the sketch below.
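For that case a range filter on @timestamp is enough. A minimal sketch, assuming an elasticsearch-py client; the client address, index pattern, example domain, and the 3-day window are illustrative assumptions, not values from the real script:

from elasticsearch import Elasticsearch

# Ad-hoc aggregation over only the last few days of raw logs.
# Client address, index pattern and the example domain are placeholders.
es = Elasticsearch(["http://localhost:9200"])
body = {
    "query": {
        "bool": {
            "must": [
                {"term": {"uriHost.raw": "www.example.com"}},
                {"range": {"@timestamp": {"gte": "now-3d/d"}}}
            ]
        }
    },
    "size": 0,
    "aggs": {
        "totalFileSize": {"sum": {"field": "repsize"}}
    }
}
result = es.search(index="logstash-*", body=body)
print(result["aggregations"]["totalFileSize"]["value"])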
Example of the Python portion of the offline statistics script:
# in_es / out_es are the Elasticsearch clients for the source and target clusters,
# and common_index carries the shared index names and timestamp fields; all three
# are module-level objects defined elsewhere in the script.
from elasticsearch import helpers

def main_statistic(domain, userId):
    # Per-domain aggregation: bucket the access logs of one domain by MIME type
    # and sum the response size (repsize) inside each bucket.
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "term": {
                            "uriHost.raw": domain
                        }
                    }
                ]
            }
        },
        "size": 0,
        "aggs": {
            "fileCount": {
                "terms": {
                    "field": "mime.raw"
                },
                "aggs": {
                    "totalFileSize": {
                        "sum": {
                            "field": "repsize"
                        }
                    }
                }
            }
        }
    }
    result = in_es.search(index=common_index.logstash_index, doc_type="fc_access", body=body)
    name = result["aggregations"]["fileCount"]
    buckets = name["buckets"]
    for name_item in buckets:
        name_key = name_item["key"]
        doc_count = name_item["doc_count"]
        totalFileSize = name_item["totalFileSize"]["value"]
        if doc_count > 0:
            # Result document keyed by domain: file count and total size per MIME type.
            browser_count_item = {
                "_index": common_index.spark_portal_index,
                "_type": "logstashIndexDF_filetype_totalsize",
                "_source": {
                    "@timestamp": common_index.timestamp_attr,
                    "add_time": common_index.add_time_attr,
                    "uriHost": domain,
                    "userId": userId,
                    "mime": name_key,
                    "fileCount": doc_count,
                    "totalFileSize": totalFileSize
                }
            }
            print(browser_count_item)
            out_count_arr.append(browser_count_item)
            # The same bucket written again as a per-user document.
            browser_count_item_use = {
                "_index": common_index.spark_portal_index,
                "_type": "logstashIndexDF_filetype_totalsize_sum",
                "_source": {
                    "@timestamp": common_index.timestamp_attr,
                    "add_time": common_index.add_time_attr,
                    "userId": userId,
                    "mime": name_key,
                    "fileCountSum": doc_count,
                    "totalFileSizeSum": totalFileSize
                }
            }
            print(browser_count_item_use)
            out_count_arr.append(browser_count_item_use)
def cacl_main(common_index_obj, domain_users):
    global common_index
    common_index = common_index_obj
    global out_count_arr
    out_count_arr = []
    for domain_user_item in domain_users:
        domain = domain_user_item["key"]
        userId = domain_user_item["user_id"]
        main_statistic(domain=domain, userId=userId)
        # Flush to the target cluster in batches so a single bulk request stays small.
        if len(out_count_arr) > 300:
            helpers.bulk(out_es, out_count_arr)
            out_count_arr = []
    # Flush whatever is left after the last domain; without this the final
    # partial batch would never be written.
    if out_count_arr:
        helpers.bulk(out_es, out_count_arr)
        out_count_arr = []
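For completeness, one hypothetical way to wire the script together is shown below; the cluster addresses, the CommonIndex field values and the domain_users sample are illustrative assumptions, since the real script defines in_es, out_es and the common_index object elsewhere:

from elasticsearch import Elasticsearch

# Hypothetical driver code; addresses and values are placeholders.
in_es = Elasticsearch(["http://es-source:9200"])   # cluster holding the raw access logs
out_es = Elasticsearch(["http://es-portal:9200"])  # cluster receiving the aggregated documents

class CommonIndex(object):
    logstash_index = "logstash-*"        # source index (or pattern) to aggregate over
    spark_portal_index = "spark_portal"  # target index for the statistics documents
    timestamp_attr = "2017-01-01T00:00:00+08:00"
    add_time_attr = "2017-01-01 00:00:00"

domain_users = [
    {"key": "www.example.com", "user_id": "1001"},
]
cacl_main(CommonIndex(), domain_users)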