Backing up data from a specified Elasticsearch index (indice) with Python

Backing up historical indice data

# -*- coding: utf-8 -*-
import json
import os
from multiprocessing import Pool
from elasticsearch import Elasticsearch

class Elastic_dump(object):
    def __init__(self, es_addr, indice):
        self._es_addr = es_addr
        self._indice = indice
        self._es = Elasticsearch(es_addr)
    # Fetch the mapping types of the index to back up (the types are defined
    # by the index's owners, i.e. the developers). The author later changed
    # the backup approach, so this method ended up unused.
    def get_indice(self):
        indice_description = self._es.indices.get(index='ad_humanproperty_v2_201919')
        doc_types = indice_description['ad_humanproperty_v2_201919']['mappings'].keys()
        return doc_types
    # By default a plain search can return at most 10,000 hits per page
    # (index.max_result_window). With a very large data set, paging everything
    # out in one shot and aggregating it in memory would exhaust the node's
    # heap (Lucene memory), so the scroll API (a cursor) is used to fetch the
    # results in batches and stream them back to the client.
    def export_data(self):
        size = 10000  # scroll batch size, i.e. 10,000 hits per fetch
        start = 0
        body = {
            "query": {"match_all": {}},  # the requirement is to back up the whole index, so a simple match_all suffices
            "size": size
        }
        # The initial search opens the scroll context and returns both the
        # scroll id and the first batch of 10,000 hits. scroll='3m' sets the
        # scroll context lifetime; with large data sets it can be set longer.
        indice_detail = self._es.search(index=self._indice, scroll='3m', body=body)
        scroll_id = indice_detail['_scroll_id']  # the scroll cursor id
        scroll_size = indice_detail['hits']['total']
        end = scroll_size // size + 1  # number of scroll fetches needed at this batch size
        # Write out the first batch returned by the initial search; the
        # original version skipped it, silently losing the first 10,000 hits.
        self.write_to_file(indice_detail)
        while start < end:
            page = self._es.scroll(scroll_id=scroll_id, scroll='3m')
            self.write_to_file(page)
            start += 1
        # Once the index is exported, shell out to compress it into a gzip archive
        os.system(r"tar -zcPf {0}.gz {1}".format(self._json_file, self._json_file))
    def write_to_file(self, page):
        datas = page["hits"]["hits"]
        json_file = r'/data/es-back/ad_humanproperty_v2/{0}.json'.format(self._indice)
        self._json_file = json_file
        # Append each document's _source as one JSON object per line;
        # the with-statement guarantees the file is flushed and closed.
        with open(json_file, 'a') as f:
            for data in datas:
                dump_data = json.dumps(data["_source"], ensure_ascii=False)
                f.write(dump_data + "\n")
# The author maps the Elasticsearch cluster nodes to local ports via port forwarding.
# Of the four nodes, only the master has the ingest (load-routing) role enabled,
# so connecting to 127.0.0.1:8060 (the master) alone would actually suffice.
def back_task(num):
    es_addr = [
        '127.0.0.1:8060',
        '127.0.0.1:8061',
        '127.0.0.1:8062',
        '127.0.0.1:8063'
    ]
    indice_header = 'ad_humanproperty_v2'
    # Indices are created per week; e.g. week 1 of 2019 lives in ad_humanproperty_v2_201901
    indice_name = '{0}_2019{1}'.format(indice_header, num)
    Elastic_dump(es_addr, indice_name).export_data()

if __name__ == '__main__':
    # Create a process pool with a fixed number of reusable worker processes
    pool = Pool(4)
    for num in range(1, 52 + 1):
        # Submit one backup task per week, asynchronously
        pool.apply_async(back_task, args=(str(num).rjust(2, '0'),))
    pool.close()
    pool.join()
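
For reference, the elasticsearch Python client also ships a helpers.scan generator that wraps this initial-search-plus-scroll loop, including clearing the cursor once the iterator is exhausted. A minimal sketch of the same export using it, assuming the same cluster address, index name and output path as above:

import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch(['127.0.0.1:8060'])
# scan() issues the initial search, follows the scroll cursor in batches,
# and clears the scroll context when all hits have been consumed.
with open('/data/es-back/ad_humanproperty_v2/ad_humanproperty_v2_201901.json', 'a') as f:
    for hit in scan(es,
                    index='ad_humanproperty_v2_201901',
                    query={"query": {"match_all": {}}},
                    size=10000,
                    scroll='3m'):
        f.write(json.dumps(hit['_source'], ensure_ascii=False) + "\n")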

A general-purpose script for daily indice backups

#!/usr/local/python3/bin/python3
import time
from datetime import date, timedelta
import json
import os
from multiprocessing import Pool
from elasticsearch import Elasticsearch


class Elastic_dump(object):
    def __init__(self, es_addr, indice_header, indice):
        self._es_addr = es_addr
        self._indice_header = indice_header
        self._indice = indice
        self._es = Elasticsearch(self._es_addr)
    def export_data(self):
        size = 10000
        start = 0
        body = {
            "query": {"match_all": {}},
            "size": size
        }
        indice_detail = self._es.search(index=self._indice, scroll='3m', body=body)
        scroll_id = indice_detail['_scroll_id']
        scroll_size = indice_detail['hits']['total']
        end = scroll_size // size + 1
        try:
            # Write the first batch from the initial search, then scroll
            # through the rest (the original version skipped the first batch).
            self.write_to_file(indice_detail)
            while start < end:
                page = self._es.scroll(scroll_id=scroll_id, scroll='3m')
                self.write_to_file(page)
                start += 1
        finally:
            # Compress the exported file, then remove the raw JSON
            os.system(r"tar -zcPf {0}.gz {1}".format(self._json_file, self._json_file))
            os.system("rm -f {0}".format(self._json_file))
    def write_to_file(self, page):
        datas = page["hits"]["hits"]
        # Group backups into per-month directories, e.g. /data/es-back/indice01/2019-01
        month = time.strftime('%Y-%m', time.localtime(time.time()))
        backdir = '/data/es-back/{0}/{1}'.format(self._indice_header, month)
        if not os.path.exists(backdir):
            os.makedirs(backdir)
        json_file = '{0}/{1}.json'.format(backdir, self._indice)
        self._json_file = json_file
        # Append each document's _source as one JSON object per line
        with open(json_file, 'a') as f:
            for data in datas:
                dump_data = json.dumps(data["_source"], ensure_ascii=False)
                f.write(dump_data + "\n")

def back_task(indice_header, elas_addr):
    # Each indice family uses its own naming convention, so yesterday's
    # index name has to be built per header.
    if indice_header == 'indice01':
        print(indice_header)
        indice_time = (date.today() + timedelta(days=-1)).strftime("%Y%m%d")
        indice_name = '{0}_{1}'.format(indice_header, indice_time)
    elif indice_header == 'indice02':
        print(indice_header)
        indice_time = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")
        indice_name = '{0}-{1}'.format(indice_header, indice_time)
    else:
        # Unknown header: the original code would hit a NameError here
        raise ValueError('no naming rule defined for {0}'.format(indice_header))
    Elastic_dump(elas_addr, indice_header, indice_name).export_data()

if __name__ == "__main__":

    # 定义一个进程池
    pool = Pool(2)
    # 定义每日需要备份的索引名称
    header_name = ['indice01','indice02']
    elas_addr = ['127.0.0.1:9200']
    for num in range(len(header_name)):
        indice_header = header_name[num]
        pool.apply_async(back_task,args=(indice_header,elas_addr,))
    pool.close()
    pool.join()
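
Each backup file is plain newline-delimited JSON, one document's _source per line, so a restore is essentially the export in reverse. A minimal restore sketch using the client's bulk helper, assuming the gzip archive has been unpacked and the target index (with its mapping) already exists; the file path and index name below are illustrative, not from the original scripts:

import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def restore(es_addr, json_file, indice):
    es = Elasticsearch(es_addr)

    def actions():
        with open(json_file) as f:
            for line in f:
                # Re-wrap each saved _source as a bulk index action.
                # Clusters older than 7.x also need a "_type" field here.
                yield {"_index": indice, "_source": json.loads(line)}

    # bulk() batches the actions and returns (number_indexed, errors)
    return bulk(es, actions())

if __name__ == '__main__':
    restore(['127.0.0.1:9200'],
            '/data/es-back/indice01/2019-01/indice01_20190101.json',
            'indice01')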

 
