# 废话不多说，直接上干货！
# ES相关包
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
class ElasticSearchClient(object):
    """Factory for the Elasticsearch connection used by this module."""

    @staticmethod
    def get_es_servers():
        """Create and return an ``Elasticsearch`` client.

        The client is configured with a single node, ``localhost`` on
        port ``"9200"`` (the port is passed as a string).

        :return: a new ``Elasticsearch`` client instance.
        """
        node = {"host": "localhost", "port": "9200"}
        return Elasticsearch(hosts=[node])
class LoadElasticSearch(object):
    """Load, store and modify documents in one Elasticsearch index.

    Every operation targets ``self.index`` / ``self.doc_type`` through
    ``self.es_client``. Constructing an instance connects to the server and
    makes sure the index and its mapping exist.
    """

    def __init__(self):
        """Set the target index/type, open the client and ensure the mapping."""
        self.index = "my-index-yzm-1"
        self.doc_type = "test-type-yzm-1"
        self.es_client = ElasticSearchClient.get_es_servers()
        self.set_mapping()

    def set_mapping(self):
        """Create the index and register its field mapping if it is missing.

        Nothing is sent to the server when the index already exists.

        NOTE(review): the ``"string"`` field type and per-type mappings are
        presumably only accepted by older Elasticsearch releases -- confirm
        against the server version actually deployed.
        """
        mapping = {
            self.doc_type: {
                "properties": {
                    "qa_id": {"type": "integer"},
                    "q": {"type": "string"},
                    "a": {"type": "string"},
                    "pos": {"type": "string"},
                    "neg": {"type": "string"},
                }
            }
        }
        if not self.es_client.indices.exists(index=self.index):
            # Create the index, then attach the mapping to it.
            # ignore=400 asks the client not to raise on an HTTP 400 reply.
            self.es_client.indices.create(index=self.index, body=mapping, ignore=400)
            self.es_client.indices.put_mapping(
                index=self.index, doc_type=self.doc_type, body=mapping
            )

    def add_date(self, row_obj):
        """Index a single document.

        :param row_obj: dict holding the document fields; an optional
            ``"_id"`` entry is used as the document id (default ``1``) and is
            not sent as part of the document body.
        """
        # Work on a copy so the caller's dict is left untouched, and pop with
        # a default so a missing "_id" falls back to 1 instead of raising.
        document = dict(row_obj)
        doc_id = document.pop("_id", 1)
        self.es_client.index(
            index=self.index, doc_type=self.doc_type, body=document, id=doc_id
        )

    def add_date_bulk(self, row_obj_list, bulk_num=100000):
        """Index many documents using the bulk helper, in fixed-size batches.

        :param row_obj_list: iterable of dicts; the keys ``_id``, ``qa_id``,
            ``q``, ``a``, ``pos`` and ``neg`` are read from each one.
        :param bulk_num: number of documents sent per bulk request
            (default 100000).
        :raises ValueError: if ``bulk_num`` is smaller than 1.

        Progress is printed per batch and a summary covering all batches is
        printed at the end, also when the input is empty.
        """
        if bulk_num < 1:
            raise ValueError("bulk_num must be at least 1")

        pending = []
        batch_no = 0
        total_success = 0
        all_failed = []
        for row_obj in row_obj_list:
            pending.append({
                "_index": self.index,
                "_type": self.doc_type,
                # NOTE(review): rows without "_id" all get the literal id
                # 'None' and therefore share one document id -- confirm that
                # callers always supply "_id".
                "_id": row_obj.get('_id', 'None'),
                "_source": {
                    'qa_id': row_obj.get('qa_id', None),
                    'q': row_obj.get('q', None),
                    'a': row_obj.get('a', None),
                    'pos': row_obj.get('pos', None),
                    'neg': row_obj.get('neg', None),
                },
            })
            # Flush as soon as a full batch has been collected.
            if len(pending) >= bulk_num:
                batch_no += 1
                print('插入', batch_no, '批数据')
                print(len(pending))
                success, failed = bulk(
                    self.es_client, pending, index=self.index, raise_on_error=True
                )
                print(success, failed)
                total_success += success
                all_failed.extend(failed)
                pending = []
        # Send whatever is left over after the last full batch.
        if pending:
            success, failed = bulk(
                self.es_client, pending, index=self.index, raise_on_error=True
            )
            total_success += success
            all_failed.extend(failed)
        print('加载成功:', total_success, '加载失败:', all_failed)

    def update_by_id(self, row_obj):
        """Partially update the document identified by ``row_obj["_id"]``.

        :param row_obj: dict with the fields to update; an optional ``"_id"``
            entry selects the document (default ``1``) and is not sent as a
            field.
        :return: None
        """
        # Same copy-and-pop approach as add_date: no caller mutation and no
        # KeyError when "_id" is absent.
        fields = dict(row_obj)
        doc_id = fields.pop("_id", 1)
        self.es_client.update(
            index=self.index, doc_type=self.doc_type, body={"doc": fields}, id=doc_id
        )

    def delete_by_id(self, _id):
        """Delete the document with the given id.

        :param _id: id of the document to delete.
        :return: None
        """
        self.es_client.delete(index=self.index, doc_type=self.doc_type, id=_id)