之前分享过 使用elasticsearch库【一】https://blog.csdn.net/zyooooxie/article/details/109588072 , 继续分享下 ;
@blog: https://blog.csdn.net/zyooooxie
@qq: 153132336
@email: [email protected]
def test_0821(index_str: str):
# Exercises several read-only client calls (search, exists, get_source, mget) against
# `index_str`, once per host listed in `list_`, logging each response.
# NOTE(review): this block looks truncated - the Elasticsearch(...) call and the
# `body={'query': ...` fragment below are never closed, and some comments have no
# statement after them. Restore the missing lines before relying on this function.
exist_id = 'xie-xie-xie-xie-xie'
no_exist_id = 'xie'
list_ = [gl_es_host_new, gl_es_host_new_2, gl_es_host_new_3, gl_es_host_new_4]
# list_ = [gl_es_host_new]
for es_host in list_:
# NOTE(review): the argument list of this constructor call is cut off after request_timeout.
client = Elasticsearch(es_host, sniff_on_start=True, sniff_on_node_failure=True, request_timeout=60,
# Returns whether the cluster is running.
# Returns basic information about the cluster.
# Returns number of documents matching a query.
# NOTE(review): the call that owns this `body=` argument is not visible, and the
# term_terms_change(...) call is missing its field/value arguments.
body={'query': term_terms_change(index_str=index_str, client=client,
# The filter_path parameter is used to reduce the response returned by elasticsearch.
Log.info(client.search(index=index_str, filter_path=['hits.hits._id', 'hits.hits._type', 'hits.total']))
# It also supports the * wildcard character to match any field or part of a field’s name
Log.info(client.search(index=index_str, filter_path=['hits.hits.*']))
# Returns information about whether a document exists in an index.
Log.info(client.exists(index=index_str, id=exist_id))
Log.info(client.exists(index=index_str, id=no_exist_id))
# Returns the source of a document.
Log.info(client.get_source(index=index_str, id=exist_id))
# Allows to get multiple documents in one request.
Log.info(client.mget(index=index_str, body={'ids': [exist_id, no_exist_id]}))
# Closes the Transport and all internal connections
# NOTE(review): no close call follows the comment above - presumably client.close(); confirm.
def test_es_indices_management(client: Elasticsearch):
# Index-management demo: builds a random name suffix and a mapping/settings fragment.
# NOTE(review): the index-creation call and the enclosing dict braces are not visible here;
# only fragments of the mapping ("properties") and settings ("number_of_shards") remain.
:param client:
# Every document in an index has a type. Each type has its own mapping, or schema definition.
# A mapping defines the fields within a type, the data type of each field, and how Elasticsearch handles those fields.
# Elasticsearch supports the following simple field types:
# String: string
# Integer: byte, short, integer, long
# Floating point: float, double
# Boolean: boolean
# Date: date
# When you index a document containing a field that has not appeared before, Elasticsearch uses dynamic mapping and tries to guess the field type from the basic JSON data types.
# The two most important index settings are:
# number_of_shards: the number of primary shards per index. This setting cannot be changed after the index is created.
# number_of_replicas: the number of replicas per primary shard. This can be changed dynamically later.
# primary shard: each document is stored in one shard; when a document is stored it is first written to the primary shard and then copied to the replicas.
# replica shard: each shard has zero or more replicas. A replica is mainly a copy of the primary shard and can increase availability and improve performance.
# Random suffix, used below to build a unique field name ("test_xie<N>").
abc = 'xie' + str(random.randint(1, 9999))
# The 'body' parameter is deprecated for the 'create' API
"properties": {
"test_{}".format(abc): {"type": "text"}
"number_of_shards": 1
def _es_get(index_str: str, client: Elasticsearch, id_str: str, doc_type: str = gl_type, **kwargs):
    """Log the document stored under ``id_str`` in ``index_str``, if it exists.

    :param index_str: Name of the index to read from.
    :param client: Elasticsearch client used for the ``exists`` and ``get`` calls.
    :param id_str: Id of the document to fetch.
    :param doc_type: The type of the document (use `_all` to fetch the first document
        matching the ID across all types). Kept for backward compatibility; it is not
        forwarded to the client.
    :param kwargs: Extra keyword arguments passed through to ``client.get``.
    :return: ``None``; the fetched document is only written to the log.
    """
    # ``exists`` returns information about whether a document exists in an index.
    # Stop here when it does not: ``get`` on a missing id raises NotFoundError.
    if not client.exists(index=index_str, id=id_str):
        Log.error(f'当前id:{id_str} 不存在')
        return

    Log.info(client.get(index=index_str, id=id_str, **kwargs))
def _es_delete(index_str: str, client: Elasticsearch, id_str: str, **kwargs):
    """Delete the document stored under ``id_str`` in ``index_str``.

    :param index_str: Name of the index to delete from.
    :param client: Elasticsearch client used for the ``exists`` and ``delete`` calls.
    :param id_str: Id of the document to delete.
    :param kwargs: Extra keyword arguments passed through to ``client.delete``.
    :return: The delete response, or ``None`` when the id does not exist.
    :raises AssertionError: If the response's ``result`` is not ``'deleted'``.
    """
    # Only attempt the delete when the document is actually present; deleting a
    # missing id would fail, so report it and bail out instead.
    if not client.exists(index=index_str, id=id_str):
        Log.error(f'当前id:{id_str} 不存在')
        return None

    res = client.delete(index=index_str, id=id_str, **kwargs)
    assert res.get('result') == 'deleted'
    return res
def _es_delete_by_query(index_str: str, client: Elasticsearch, body: dict, **kwargs):
    """Delete every document in ``index_str`` that matches ``body`` and log the counts.

    :param index_str: Name of the index to delete from.
    :param client: Elasticsearch client used for the ``delete_by_query`` call.
    :param body: The search definition using the Query DSL.
    :param kwargs: Extra keyword arguments passed through to ``delete_by_query``.
    :return: The raw ``delete_by_query`` response.
    """
    # scroll_size is the size of the scroll request powering the delete by query
    # (default: 100); a larger batch of 1000 is requested here.
    response = client.delete_by_query(index=index_str, body=body, scroll_size=1000, **kwargs)

    total = response.get("total")
    deleted = response.get("deleted")
    Log.info(f'{total}, {deleted}')
    return response
def _es_create(index_str: str, client: Elasticsearch, id_str: str, document: dict, **kwargs):
    """Create a new document under ``id_str`` in ``index_str``.

    :param index_str: Name of the index to write to.
    :param client: Elasticsearch client used for the ``exists`` and ``create`` calls.
    :param id_str: Id for the new document; it must not be in use yet.
    :param document: Source of the document to create.
    :param kwargs: Extra keyword arguments passed through to ``client.create``.
    :return: The create response, or ``None`` when the id already exists.
    :raises AssertionError: If the response's ``result`` is not ``'created'``.
    """
    # ``create`` is only valid for ids that are not taken yet, so report a clash
    # and bail out instead of sending a request that would be rejected.
    if client.exists(index=index_str, id=id_str):
        Log.error(f'当前id:{id_str} 已存在')
        return None

    res = client.create(index=index_str, id=id_str, document=document, **kwargs)
    assert res.get('result') == 'created'
    return res
def _es_index(index_str: str, client: Elasticsearch, document: dict, id_str: str = None, **kwargs):
    """Index ``document``: overwrite an existing id, or create a new auto-id document.

    :param index_str: Name of the index to write to.
    :param client: Elasticsearch client used for the ``exists`` and ``index`` calls.
    :param document: Full source of the document to store.
    :param id_str: Id of an existing document to replace; when omitted, a new
        document is created and Elasticsearch assigns the id.
    :param kwargs: Extra keyword arguments passed through to ``client.index``.
    :return: The index response, or ``None`` when ``id_str`` is given but absent.
    :raises AssertionError: If the response's ``result`` is not the expected
        ``'updated'`` (with ``id_str``) or ``'created'`` (without).
    """
    # New usage:        es.index(document={...})
    # Deprecated usage: es.index(body={...})
    if id_str:
        # Full-overwrite update of one id; it must already exist.
        if not client.exists(index=index_str, id=id_str):
            Log.error(f'当前id:{id_str} 不存在')
            return None

        res = client.index(index=index_str, document=document, id=id_str, **kwargs)
        assert res.get('result') == 'updated'
    else:
        # No id supplied: store a brand-new document. Keeping this in the else
        # branch prevents an overwrite from also creating a duplicate document.
        res = client.index(index=index_str, document=document, **kwargs)
        assert res.get('result') == 'created'

    return res
def _es_update(index_str: str, client: Elasticsearch, id_str: str, doc: dict, **kwargs):
    """Merge the partial document ``doc`` into the existing document ``id_str``.

    The update API enables scripted document updates (the script can update, delete,
    or skip modifying the document) and also accepts a partial document, which is
    merged into the existing one. To fully replace a document, use the index API.

    :param index_str: Name of the index holding the document.
    :param client: Elasticsearch client used for the ``exists`` and ``update`` calls.
    :param id_str: Id of the document to update.
    :param doc: Partial document to merge into the stored one.
    :param kwargs: Extra keyword arguments passed through to ``client.update``.
    :return: The update response, or ``None`` when the id does not exist.
    """
    document_found = client.exists(index=index_str, id=id_str)
    if not document_found:
        Log.error(f'当前id:{id_str} 不存在')
        return None

    # The 'body' parameter is deprecated for the 'update' API and will be removed in
    # a future version, so the partial document is passed via ``doc`` directly.
    return client.update(index=index_str, id=id_str, doc=doc, **kwargs)
def _bulk_actions(index_str: str,
id_str: List[str], body: Union[List[dict], List[int]],
op_type: str = 'index'):
:param index_str:
:param id_str:
:param body:
:param op_type: defaults to index
# All bulk helpers accept an instance of {es} class and an iterable action (any iterable, can also be a generator, which is ideal in most cases since it allows you to index large datasets without the need of loading them into memory).
ib_list = list(zip(id_str, body))
for id_body in ib_list:
actions_dict = {"_index": index_str, "_id": id_body[0], '_op_type': op_type}
# The bulk() api accepts index, create, delete, and update actions.
# Use the _op_type field to specify an action (_op_type defaults to index)
if op_type == 'index' or op_type == 'create':
elif op_type == 'delete':
elif op_type == 'update':
raise Exception('传参有误')
yield actions_dict
def _es_bulk(index_str: str, client: Elasticsearch,
             body_list: List[dict], id_list: List[str],
             op_type: str, **kwargs):
    """Run one bulk request built from parallel id and body lists.

    See https://elasticsearch-py.readthedocs.io/en/v7.17.0/helpers.html#bulk-helpers

    :param index_str: Index every action targets.
    :param client: Elasticsearch client handed to the ``bulk`` helper.
    :param body_list: Per-document payloads, paired positionally with ``id_list``.
    :param id_list: Document ids.
    :param op_type: Bulk action passed on to ``_bulk_actions``.
    :param kwargs: Extra keyword arguments passed through to the ``bulk`` helper.
    :return: Whatever the ``bulk`` helper returns.
    """
    from elasticsearch.helpers import bulk

    # The actions are produced lazily by the generator and consumed by the helper.
    return bulk(
        client=client,
        actions=_bulk_actions(index_str=index_str, body=body_list, id_str=id_list, op_type=op_type),
        **kwargs,
    )
def _es_analyze(index_str: str, client: Elasticsearch, text: str):
    """Return the tokens the index's analyzer produces for ``text``.

    :param index_str: Index whose analyzer is applied.
    :param client: Elasticsearch client used for the ``indices.analyze`` call.
    :param text: Text to analyze. A plain ``str`` is recommended; if an array of
        strings is provided, it is analyzed as a multi-value field.
    :return: List of token strings in response order; empty when the analyzer
        produces no tokens.
    """
    # Data in Elasticsearch falls broadly into two categories: exact values and full text.
    # Exact values are exactly what they sound like, e.g. a date or a user ID; strings can
    # be exact values too, e.g. a username or an email address. For exact values, Foo and
    # foo differ, and so do 2014 and 2014-09-15.
    # Full text, on the other hand, is textual data (usually written in a human language),
    # such as the body of a tweet or an email.
    #
    # When a document is indexed, its full-text fields are analyzed into terms that are
    # used to build the inverted index. When searching a full-text field, the query string
    # must go through the same analysis so that the search terms have the same form as
    # the indexed terms.
    # Querying a full-text field applies the same analyzer to the query string to produce
    # the list of terms to search for. Querying an exact-value field does not analyze the
    # query string; it searches for the exact value given.
    #
    # By default, Elasticsearch changes the values of text fields during analysis.
    # For example, the default standard analyzer changes text field values as follows:
    #   1. Removes most punctuation
    #   2. Divides the remaining content into individual words, called tokens
    #   3. Lowercases the tokens
    #
    # analyzer
    # An analyzer can be chosen per field. Each field may have a different analyzer, set
    # either on the field itself or through a higher-level default (type, index, or node).
    # If this parameter is not specified, the analyze API uses the analyzer defined in the
    # field's mapping. If no field is specified, it uses the default analyzer for the index.
    # If no index is specified, or the index does not have a default analyzer, the analyze
    # API uses the standard analyzer.
    res = client.indices.analyze(body={'text': text}, index=index_str)

    # Each entry describes one token: `token` is the actual term stored in the index,
    # `position` indicates where the term appeared in the original text, and
    # `start_offset` / `end_offset` give the character positions in the original string.
    # One comprehension covers zero, one or many tokens, so text that analyzes to
    # nothing yields [] instead of failing on tokens_list[0].
    tokens_list = res.get('tokens') or []
    return [tl.get('token') for tl in tokens_list]
def query_term(index_: str, es_: Elasticsearch,
               field: str = None, value: Union[str, int] = None,
               field_field: str = None):
    """Build a term query for ``field = value`` or ``field.field_field = value``.

    :param index_: Index the query is meant for.
    :param es_: Elasticsearch client passed on to ``term_terms_change``.
    :param field: Field name to match on.
    :param value: Value to match.
    :param field_field: Optional sub-field; when given, the query targets
        ``field.field_field``.
    :return: The ``query`` entry of the assembled search body (``None`` if the
        copied template has none and no field was supplied).
    """
    search_body = gl_search_dict.copy()

    # A sub-field takes precedence and is addressed with a dotted path.
    if field_field:
        target_field = '.'.join([field, field_field])
    elif field:
        target_field = field
    else:
        target_field = None

    if target_field is not None:
        search_body['query'] = term_terms_change(index_str=index_, client=es_,
                                                 field=target_field, value=value)

    print_result(index_, search_body)
    return search_body.get('query')
def print_result(index_: str, dict_1: dict):
    """Log a search request as a request line followed by its JSON body.

    :param index_: Index name used to build the ``GET /<index>/_doc/_search`` line.
    :param dict_1: Search body; serialized with ``ensure_ascii=False`` so non-ASCII
        text stays readable.
    :return: ``None``.
    """
    res = f'GET /{index_}/_doc/_search'
    res_ = json.dumps(dict_1, ensure_ascii=False)

    # Emit both parts; without this the function built the strings and discarded them.
    Log.info(f'{res}\n{res_}')
def term_terms_change(index_str: str, client: Elasticsearch, field: str, value: Any,
                      simple_use: bool = False,
                      term_terms: str = 'term', **kwargs
                      ) -> dict:
    """Build a ``term``/``terms`` clause, choosing between ``field`` and ``field.keyword``.

    :param index_str: Index whose analyzer is consulted for ``value``.
    :param client: Elasticsearch client passed on to ``_es_analyze``.
    :param field: Field name to query.
    :param value: Value (or values) to match.
    :param simple_use: When true, query ``field`` as given without consulting the analyzer.
    :param term_terms: Clause type, ``'term'`` or ``'terms'``.
    :param kwargs: Extra top-level entries merged into the returned dict.
    :return: ``{term_terms: {<field or field.keyword>: value}, **kwargs}``.
    :raises AssertionError: If ``term_terms`` is neither ``'term'`` nor ``'terms'``.
    """
    assert term_terms in ['term', 'terms']

    if not value:
        # Falsy values always go to the keyword sub-field.
        use_keyword = True
    elif simple_use:
        use_keyword = False
    else:
        # https://www.elastic.co/guide/en/elasticsearch/reference/7.17/query-dsl-term-query.html#avoid-term-query-text-fields
        # The term query does not analyze the search term; it only searches for the
        # exact term you provide, so it may return poor or no results on text fields.
        # The plain field is used only when the value is a string that survives
        # analysis unchanged as one of its own tokens.
        analyzed_tokens = _es_analyze(index_str=index_str, client=client, text=value)
        survives_analysis = value in analyzed_tokens and isinstance(value, str)
        use_keyword = not survives_analysis

    target_field = field + '.keyword' if use_keyword else field
    return {term_terms: {target_field: value}, **kwargs}
def es_func(func_name: str, **kwargs):
# Dispatcher: looks up the module-level helper named `func_name` and calls it with `kwargs`.
# When no `client` is supplied in kwargs, a client is created via connect_es_client and
# flagged (close_) for closing afterwards.
# NOTE(review): this block looks truncated - the func_list literal is never closed, the
# try/else/finally keywords are missing, and two except clauses have no body. It is also
# not visible where the newly created client is stored into kwargs; confirm against the
# original file.
:param func_name:
:param kwargs:
func_list = ['_es_get', '_es_delete', '_es_create', '_es_index', '_es_search', '_es_update', '_es_bulk',
if not kwargs.get('client'):
client = connect_es_client(gl_es_host_new, gl_es_auth)
close_ = True
# NOTE(review): presumably the else branch (caller supplied its own client) - confirm.
close_ = False
# Log.debug(f'传参:{kwargs}')
assert func_name in func_list
# Two equivalent ways to resolve the function; pick one
# globals().get(func_name)(**kwargs)
return getattr(sys.modules[__name__], func_name)(**kwargs)
except AssertionError:
except BulkIndexError:
Log.error('Bulk 遇到错误,被中断')
except Exception as e:
Log.info('es_func() 执行结束')
# NOTE(review): the statement guarded by this check is missing - presumably client.close().
if close_:
def get_seq_max(index_str: str, client: Elasticsearch):
    """Return the largest ``seq`` value stored in ``index_str``, or 0 if there is none.

    :param index_str: Index to search.
    :param client: Elasticsearch client passed on to ``es_func``.
    :return: ``seq`` of the top hit when sorting by ``seq`` descending; ``0`` when
        the search yields nothing.
    """
    # Only documents that actually carry a `seq` field are considered.
    q = {'exists': {'field': 'seq'}}

    # Sorted descending with size 1, so the first hit (if any) holds the maximum.
    # NOTE(review): assumes `_es_search` returns a list of hit dicts - confirm.
    res_list = es_func('_es_search', client=client, index_str=index_str, query=q,
                       sort_={"seq": {"order": "desc"}}, size_=1)

    # Return early on an empty/None result so res_list[0] is never touched without a hit.
    if not res_list:
        return 0

    return res_list[0].get('_source').get('seq')
