Elasticsearch 是一个开源的分布式搜索和分析引擎,它提供了强大的查询和分析功能。它基于 Apache Lucene 构建,支持大规模数据的实时搜索,并具有高可用性和可扩展性。
在 Elasticsearch 中,查询分析器负责处理用户搜索的输入,将文本进行分词并生成倒排索引。分析器在搜索过程中起到了关键作用,它们决定了搜索的准确性和效率。
标准分析器是 Elasticsearch 默认的分析器,它使用标准分词器和小写转换器;停用词过滤器是可选的,在较新版本中默认关闭,可通过配置启用。
// 创建查询分析器
Analyzer analyzer = new StandardAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
简单分析器将输入文本按照非字母字符进行分词,并将所有的词转换成小写
// 创建查询分析器
Analyzer analyzer = new SimpleAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
空格分析器根据空格进行分词,不进行任何其他处理
// 创建查询分析器
Analyzer analyzer = new WhitespaceAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
停用词分析器移除一些常见的英文停用词,例如 “a”, “the”, “is” 等
// 创建查询分析器,指定停用词列表
Analyzer analyzer = new StopAnalyzer(CharArraySet.EMPTY_SET);
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
关键词分析器将输入视为单个关键词,不进行分词和任何其他处理
// 创建查询分析器
Analyzer analyzer = new KeywordAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
模式分析器根据正则表达式来分词,它将输入文本匹配正则表达式的部分作为词汇
// 创建查询分析器,指定正则表达式
Analyzer analyzer = new PatternAnalyzer(Pattern.compile("\\W+"), true, true);
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
语言分析器是针对不同语言的特定分析器,可以提供更好的分词和处理效果
英语分析器基于英语特定的分词规则和处理方式
// 创建查询分析器
Analyzer analyzer = new EnglishAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
中文分析器基于中文特定的分词规则和处理方式
// 创建查询分析器
Analyzer analyzer = new SmartChineseAnalyzer();
// 使用查询分析器进行分词
String text = "Elasticsearch is a distributed, RESTful search and analytics engine.";
TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
// 遍历分词结果
tokenStream.reset();
while (tokenStream.incrementToken()) {
System.out.println(termAttribute.toString());
}
tokenStream.end();
tokenStream.close();
代码示例
import org.elasticsearch.index.analysis.Analyzer;
import org.elasticsearch.index.analysis.TokenizerFactory;
// Factory that plugs a custom Tokenizer into the Elasticsearch analysis chain.
// NOTE(review): the (IndexSettings, Environment, String, Settings) ctor and the
// super(indexSettings, name, settings) call match older ES TokenizerFactory
// conventions — these signatures changed across ES versions; confirm against
// the target version.
class CustomTokenizerFactory extends TokenizerFactory {
    public CustomTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
    }
    @Override
    public Tokenizer create() {
        // Implement the custom tokenizer logic here
        return new CustomTokenizer();
    }
}
// Skeleton for a custom tokenizer.
// NOTE(review): this uses the legacy (pre-Lucene-4) TokenStream API in which
// `Token next()` returned tokens directly; modern Lucene makes Tokenizer an
// abstract class and produces terms via incrementToken() + attributes —
// confirm which Lucene version this article targets.
class CustomTokenizer implements Tokenizer {
    @Override
    public Token next() throws IOException {
        // FIX: the original body had no return statement, which does not
        // compile. In the legacy API, returning null signals end-of-stream;
        // replace this with real token production logic.
        return null;
    }
}
代码示例
import org.elasticsearch.index.analysis.TokenFilterFactory;
// Factory that plugs a custom TokenFilter into the Elasticsearch analysis chain.
// NOTE(review): as with CustomTokenizerFactory, the ctor/super signatures here
// follow older ES TokenFilterFactory conventions — confirm against the target
// ES version.
class CustomFilterFactory extends TokenFilterFactory {
    public CustomFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
    }
    @Override
    public TokenStream create(TokenStream tokenStream) {
        // Wrap the upstream token stream with the custom filter
        return new CustomFilter(tokenStream);
    }
}
// Skeleton for a custom token filter.
// NOTE(review): legacy (pre-Lucene-4) TokenFilter API; modern filters override
// incrementToken() instead — confirm which Lucene version this targets.
class CustomFilter extends TokenFilter {
    public CustomFilter(TokenStream tokenStream) {
        super(tokenStream);
    }
    @Override
    public Token next(Token reusableToken) throws IOException {
        // FIX: the original body had no return statement, which does not
        // compile. A pass-through filter delegates to the wrapped stream
        // (the protected `input` field from TokenFilter); insert the actual
        // filtering/transformation logic around this call.
        return input.next(reusableToken);
    }
}
代码示例
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.index.analysis.Analyzer;
import org.elasticsearch.index.analysis.Tokenizer;
// Provider that registers the custom analyzer with the index's analysis module.
class CustomAnalyzerProvider extends AbstractIndexAnalyzerProvider<Analyzer> {
    // Built once at construction time and handed out by get().
    private final Analyzer analyzer;

    public CustomAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        super(indexSettings, name, settings);
        // Assemble the analysis chain: custom tokenizer feeding the custom filter.
        this.analyzer = new CustomAnalyzer(
                new CustomTokenizerFactory(indexSettings, environment, name, settings),
                new CustomFilterFactory(indexSettings, environment, name, settings));
    }

    @Override
    public Analyzer get() {
        return analyzer;
    }
}
// Analyzer that composes a tokenizer and a token filter supplied via factories.
class CustomAnalyzer extends Analyzer {
    private final TokenizerFactory tokenizerFactory;
    private final TokenFilterFactory tokenFilterFactory;

    public CustomAnalyzer(TokenizerFactory tokenizerFactory, TokenFilterFactory tokenFilterFactory) {
        this.tokenizerFactory = tokenizerFactory;
        this.tokenFilterFactory = tokenFilterFactory;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Source tokenizer first, then wrap it with the configured filter.
        final Tokenizer source = tokenizerFactory.create();
        return new TokenStreamComponents(source, tokenFilterFactory.create(source));
    }
}
代码示例
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.transport.TransportAddress;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.index.query.QueryBuilder;
// Point the client at the target cluster by name.
Settings settings = Settings.builder()
        .put("cluster.name", "your_cluster_name")
        .build();
// FIX: TransportClient is Closeable and owns transport threads — the original
// never closed it (thread/socket leak). try-with-resources releases it.
// NOTE(review): TransportClient is deprecated in ES 7 and removed in ES 8;
// prefer the Java API client / High Level REST Client for new code.
try (TransportClient client = new PreBuiltTransportClient(settings)) {
    // Register the node address. TransportAddress (already imported above)
    // replaces the deprecated InetSocketTransportAddress the original used.
    client.addTransportAddress(new TransportAddress(new InetSocketAddress("your_host_name", 9300)));
    // Match query analyzed with the custom analyzer registered on the index.
    QueryBuilder query = QueryBuilders.matchQuery("your_field", "your_query_text")
            .analyzer("your_custom_analyzer");
    // Execute the search synchronously.
    SearchResponse response = client.prepareSearch("your_index_name")
            .setQuery(query)
            .execute()
            .actionGet();
}
模糊搜索,是指根据用户提供的关键词在搜索时进行近似匹配,而非完全精确匹配。在 Elasticsearch 中,可以使用 Fuzzy Query 实现模糊搜索。
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder queryBuilder = QueryBuilders.fuzzyQuery("field", "keyword")
.fuzziness(Fuzziness.AUTO)
.prefixLength(3)
.maxExpansions(10);
searchSourceBuilder.query(queryBuilder);
SearchRequest searchRequest = new SearchRequest("index");
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
field
是需要进行模糊搜索的字段名称。keyword
是用户提供的关键词。fuzziness
参数指定了模糊匹配的程度,Fuzziness.AUTO
表示自动选择最佳匹配程度。prefixLength
参数指定了模糊匹配时前缀的最小长度。maxExpansions
参数指定了在扩展搜索条件时的最大扩展次数。细粒度搜索,是指根据用户提供的关键词进行精确匹配,包括前缀匹配、通配符匹配和正则表达式匹配。
前缀匹配,是指根据用户提供的关键词匹配字段值的前缀。在 Elasticsearch 中,可以使用 Prefix Query 实现前缀匹配。
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder queryBuilder = QueryBuilders.prefixQuery("field", "prefix");
searchSourceBuilder.query(queryBuilder);
SearchRequest searchRequest = new SearchRequest("index");
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
field
是需要进行前缀匹配的字段名称。prefix
是用户提供的前缀关键词。通配符匹配,是指根据用户提供的带有通配符的关键词进行匹配。在 Elasticsearch 中,可以使用 Wildcard Query 实现通配符匹配。
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder queryBuilder = QueryBuilders.wildcardQuery("field", "keyword*");
searchSourceBuilder.query(queryBuilder);
SearchRequest searchRequest = new SearchRequest("index");
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
field
是需要进行通配符匹配的字段名称。keyword*
是用户提供的带有通配符的关键词,*
表示匹配任意多个字符。正则表达式匹配,是指根据用户提供的正则表达式进行匹配。在 Elasticsearch 中,可以使用 Regexp Query 实现正则表达式匹配。
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder queryBuilder = QueryBuilders.regexpQuery("field", "regex");
searchSourceBuilder.query(queryBuilder);
SearchRequest searchRequest = new SearchRequest("index");
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
field
是需要进行正则表达式匹配的字段名称。regex
是用户提供的正则表达式。多语言搜索,是指根据用户提供的关键词进行跨语言的搜索。在 Elasticsearch 中,可以使用 MultiMatch Query 实现多语言搜索。
SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder();
QueryBuilder queryBuilder = QueryBuilders.multiMatchQuery("keyword", "field1", "field2");
searchSourceBuilder.query(queryBuilder);
SearchRequest searchRequest = new SearchRequest("index");
searchRequest.source(searchSourceBuilder);
SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
keyword
是用户提供的关键词。field1
和 field2
是需要进行搜索的字段名称列表。数据清理和标准化,是指对用户提供的关键词进行处理,使其符合特定规范,以便更好地匹配。在 Elasticsearch 中,可以使用 Analyzers 和 Tokenizers 实现数据清理和标准化。
String keyword = "原始关键词";
// FIX: AnalyzeRequest.withGlobalAnalyzer(String analyzer, String... text) takes
// an ANALYZER name as its first argument — the original passed the index name
// there. To analyze text with an index-scoped analyzer, use
// withIndexAnalyzer(index, analyzer, text...). "standard" is used here;
// substitute the analyzer you want to test.
AnalyzeRequest analyzeRequest = AnalyzeRequest.withIndexAnalyzer("index", "standard", keyword);
AnalyzeResponse analyzeResponse = client.indices().analyze(analyzeRequest, RequestOptions.DEFAULT);
// Each AnalyzeToken is one normalized term produced by the analysis chain.
List<AnalyzeResponse.AnalyzeToken> tokens = analyzeResponse.getTokens();
for (AnalyzeResponse.AnalyzeToken token : tokens) {
    System.out.println(token.getTerm());
}
keyword
是用户提供的原始关键词。index
是需要进行数据清理和标准化的索引名称。以上就是 Elasticsearch 查询分析器的简要介绍以及常见使用场景的示例代码。通过这些查询类型和数据处理功能,可以实现高效、准确的搜索和分析。
在 Elasticsearch 中,查询的解析过程是将查询字符串转换为查询对象的过程。为了分析查询解析过程,可以使用 Elasticsearch 的 SearchRequest
对象的 source
方法来传递搜索请求的源代码。
SearchRequest searchRequest = new SearchRequest("your_index");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.matchQuery("field_name", "search_query"));
searchRequest.source(sourceBuilder);
对于调试和查看查询的匹配细节,可以使用 Elasticsearch 的 Explain API。Explain API 接受一个搜索请求并返回与该请求匹配的文档,以及有关如何计算匹配度的详细信息。
// FIX: the High Level REST Client ExplainRequest explains how a query scores
// ONE specific document, so it requires a document id, and the query is set
// via query(QueryBuilder) — the original called a source(SearchSourceBuilder)
// overload that does not exist on ExplainRequest.
ExplainRequest explainRequest = new ExplainRequest("your_index", "your_doc_id");
explainRequest.query(QueryBuilders.matchQuery("field_name", "search_query"));
性能测试是评估 Elasticsearch 集群查询性能的一种方法。可以使用 Elasticsearch 的搜索 Profile 来帮助识别潜在的性能问题。
SearchRequest searchRequest = new SearchRequest("your_index");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.matchQuery("field_name", "search_query"));
sourceBuilder.profile(true);
searchRequest.source(sourceBuilder);
除了性能测试,旧版本的 Elasticsearch 还提供了 Warmers 来优化搜索性能。Warmers 是一种预热索引的机制,它可以事先执行某些搜索并预热相关缓存,以提高后续的搜索性能。需要注意的是,Warmers 功能已在 Elasticsearch 2.3 中废弃并于 5.0 中移除,以下示例仅适用于旧版本集群。
// NOTE(review): index warmers (and this PutWarmers-style request) were
// deprecated in Elasticsearch 2.3 and removed in 5.0 — this snippet only
// applies to legacy clusters. On modern ES, rely on the OS file-system cache,
// the shard request cache, and index sorting instead. The exact request class
// name also varies by client version (e.g. PutWarmerRequest) — confirm.
PutWarmersRequest putWarmersRequest = new PutWarmersRequest("your_index");
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
sourceBuilder.query(QueryBuilders.matchQuery("field_name", "search_query"));
// The warmer replays this search on refresh to pre-populate caches.
SearchRequest searchRequest = new SearchRequest("your_index");
searchRequest.source(sourceBuilder);
putWarmersRequest.addWarmers("your_warmer", searchRequest);