Backing up historical indice data
# -*- coding: utf-8 -*-
import json
import os
from multiprocessing import Pool
from elasticsearch import Elasticsearch


class Elastic_dump(object):
    def __init__(self, es_addr, indice):
        self._es_addr = es_addr
        self._indice = indice
        self._es = Elasticsearch(es_addr)

    # Fetch the types of the index to back up. The types are defined by the
    # index owners (the developers). The backup approach was changed later,
    # so this method ended up unused.
    def get_indice(self):
        indice_description = self._es.indices.get(index='ad_humanproperty_v2_201919')
        doc_types = indice_description['ad_humanproperty_v2_201919']['mappings'].keys()
        return doc_types
    # A plain search is capped by index.max_result_window (10000 hits by
    # default), and paging a huge index out in one shot would pile everything
    # into memory and exhaust the Lucene heap on the ES hosts. So we use the
    # scroll API to fetch the data in batches and merge them on the client
    # (a helpers.scan-based equivalent is sketched after this script).
    def export_data(self):
        size = 10000  # scroll batch size: fetch 10000 hits per request
        body = {
            "query": {"match_all": {}},  # the whole index is wanted, so a plain match_all is enough
            "size": size
        }
        # The initial search returns the scroll id along with the first batch
        # of hits. scroll='3m' is how long the scroll context is kept alive
        # between requests; set it longer for very large data volumes.
        page = self._es.search(index=self._indice, scroll='3m', body=body)
        scroll_id = page['_scroll_id']
        # Write the first batch returned by the search itself, then keep
        # scrolling until a page comes back empty.
        while page['hits']['hits']:
            self.write_to_file(page)
            page = self._es.scroll(scroll_id=scroll_id, scroll='3m')
        # Once the index is fully dumped, compress it into a gzip archive.
        os.system(r"tar -zcPf {0}.gz {1}".format(self._json_file, self._json_file))
    def write_to_file(self, page):
        datas = page["hits"]["hits"]
        json_file = r'/data/es-back/ad_humanproperty_v2/{0}.json'.format(self._indice)
        self._json_file = json_file
        # Append one JSON document per line; the file is shared across scroll pages.
        with open(json_file, 'a') as file:
            for data in datas:
                dump_data = json.dumps(data["_source"], ensure_ascii=False)
                file.write(dump_data + "\n")
# The Elasticsearch cluster nodes are mapped to localhost via port forwarding.
# Of the four nodes, only the master has ingest routing enabled, so connecting
# to 127.0.0.1:8060 (the master) alone would actually be enough.
def back_task(num):
    es_addr = [
        '127.0.0.1:8060',
        '127.0.0.1:8061',
        '127.0.0.1:8062',
        '127.0.0.1:8063'
    ]
    indice_header = 'ad_humanproperty_v2'
    # Indices are created per week: data for week 1 of 2019, for example,
    # lives in the index ad_humanproperty_v2_201901.
    indice_name = indice_header + '_' + '2019' + num
    Elastic_dump(es_addr, indice_name).export_data()
if __name__ == '__main__':
    # Create a process pool with a fixed number of reusable worker processes.
    pool = Pool(4)
    for num in range(1, 52 + 1):
        # Submit the backup tasks asynchronously, zero-padding the week number.
        pool.apply_async(back_task, args=(str(num).rjust(2, '0'),))
    pool.close()
    pool.join()
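
For comparison, the elasticsearch Python client also ships a helpers.scan generator that wraps exactly this search-then-scroll pattern and handles the scroll-id bookkeeping internally. A minimal sketch of the same dump built on it, assuming the same index name and output directory as the script above:

# -*- coding: utf-8 -*-
# Sketch: the same match_all dump via elasticsearch.helpers.scan.
# Index name and output path are taken over from the script above.
import json
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['127.0.0.1:8060'])
indice = 'ad_humanproperty_v2_201901'

with open('/data/es-back/ad_humanproperty_v2/{0}.json'.format(indice), 'a') as f:
    # scan() yields hit after hit, issuing scroll requests of `size` hits
    # behind the scenes and clearing the scroll context when done.
    for hit in helpers.scan(es, index=indice,
                            query={"query": {"match_all": {}}},
                            scroll='3m', size=10000):
        f.write(json.dumps(hit["_source"], ensure_ascii=False) + "\n")
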
A generic version for daily indice backups
#!/usr/local/python3/bin/python3
import time
from datetime import date, timedelta
import json
import os
from multiprocessing import Pool
from elasticsearch import Elasticsearch


class Elastic_dump(object):
    def __init__(self, es_addr, indice_header, indice):
        self._es_addr = es_addr
        self._indice_header = indice_header
        self._indice = indice
        self._es = Elasticsearch(self._es_addr)
    def export_data(self):
        size = 10000  # scroll batch size: 10000 hits per request
        body = {
            "query": {"match_all": {}},
            "size": size
        }
        page = self._es.search(index=self._indice, scroll='3m', body=body)
        scroll_id = page['_scroll_id']
        try:
            # Write the first batch returned by the search itself, then keep
            # scrolling until a page comes back empty.
            while page['hits']['hits']:
                self.write_to_file(page)
                page = self._es.scroll(scroll_id=scroll_id, scroll='3m')
        finally:
            # Compress the dump into a gzip archive and drop the raw JSON file.
            os.system(r"tar -zcPf {0}.gz {1}".format(self._json_file, self._json_file))
            os.system("rm -rf {0}".format(self._json_file))
    def write_to_file(self, page):
        datas = page["hits"]["hits"]
        # Dumps are grouped by month: /data/es-back/<indice_header>/<YYYY-MM>/
        month = time.strftime('%Y-%m', time.localtime(time.time()))
        backdir = '/data/es-back/{0}/{1}'.format(self._indice_header, month)
        if not os.path.exists(backdir):
            os.makedirs(backdir)
        json_file = '{0}/{1}.json'.format(backdir, self._indice)
        self._json_file = json_file
        # Append one JSON document per line; the file is shared across scroll pages.
        with open(json_file, 'a') as file:
            for data in datas:
                dump_data = json.dumps(data["_source"], ensure_ascii=False)
                file.write(dump_data + "\n")
# Each indice family follows its own naming convention, so yesterday's index
# name has to be built per family.
def back_task(indice_header, elas_addr):
    yesterday = date.today() + timedelta(days=-1)
    if indice_header == 'indice01':
        print(indice_header)
        indice_name = indice_header + '_' + yesterday.strftime("%Y%m%d")
    elif indice_header == 'indice02':
        print(indice_header)
        indice_name = indice_header + '-' + yesterday.strftime("%Y-%m-%d")
    else:
        return  # unknown indice family, nothing to back up
    Elastic_dump(elas_addr, indice_header, indice_name).export_data()
if __name__ == "__main__":
    # Create a process pool, one worker per index family.
    pool = Pool(2)
    # Index name prefixes that need a daily backup.
    header_name = ['indice01', 'indice02']
    elas_addr = ['127.0.0.1:9200']
    for indice_header in header_name:
        pool.apply_async(back_task, args=(indice_header, elas_addr,))
    pool.close()
    pool.join()
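
A dump written this way (one document's _source per line) can be replayed into a cluster with the bulk helper from the same client library. A minimal restore sketch; the dump file path and the target index name restore_target below are placeholders:

# Sketch: restore a line-per-document JSON dump via elasticsearch.helpers.bulk.
# The dump file path and the target index name are placeholders.
import json
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(['127.0.0.1:9200'])

def actions(json_file, target_index):
    # Each line holds one document's _source, as written by write_to_file().
    # On pre-7.x clusters an explicit "_type" may also be required per action.
    with open(json_file) as f:
        for line in f:
            yield {"_index": target_index, "_source": json.loads(line)}

ok, errors = helpers.bulk(es, actions('/data/es-back/indice01/2019-01/dump.json',
                                      'restore_target'))
print('indexed {0} docs'.format(ok))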