1.数据源于
2.加载数据:
curl -s -XPOST http://172.22.112.1:9200/_bulk --data-binary @elasticsearch.txt
-S:显示错误,-s静默模式
【上传数据时,中途kill,不会部分上传成功,皆失败】
3.设置分词器
在没有设置分词器之前,ES会将汉字单个切分(当安装完Elasticsearch之后,默认已经含有一个分词法,叫做“standard”,对中文的支持较差),比如搜索:
http://172.22.112.1:9200/listing_new/listing/_search?q=category_name:海水,
GET /listing_new/listing/_search?q=category_name:%E6%B5%B7%E6%B0%B4
会出来“海鲜水产”,而我们可能希望“海水”二字并不分开。
安装分词器:
.\elasticsearch-plugin.bat install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v5.6.8/elasticsearch-analysis-ik-5.6.8.zip
重启ES;
删除之前自动生成的索引:
DELETE /listing_new
在创建index之前新建mapping
- 默认_all是开启的(也就是说,不指定字段时会检索所有没有禁用_all参数的字段),通过 "_all": { "enabled": false } 来禁用
action.auto_create_index: false
## 手动设置mapping,ik
PUT /listing_new
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"ik": {
"tokenizer": "ik_smart"
}
}
}
},
"mappings": {
"listing": {
"dynamic": true,
"properties": {
"listing_title": {
"type": "text",
"analyzer": "ik"
},
"category_name": {
"type": "text",
"analyzer": "ik"
},
"listing_id": {
"type": "long"
},
"category_id": {
"type": "long"
}
}
}
}
}
报错:"Custom Analyzer [ik] failed to find tokenizer under name [ik_smart]"
安装分词器后需要重启集群,不需要设置 index.analysis.analyzer.ik.type: "ik"
再次执行查询:http://172.22.112.1:9200/listing_new/listing/_search?q=category_name:海水
可以看到无命中
ES关于停用词的文档
可使用stopwords参数设置停用词
文档字段的fielddata
对文档字段(text fielddata)执行terms聚合时报错:
Fielddata is disabled on text fields by default. Set fielddata=true on [category_name] in order to load fielddata in memory by uninverting the inverted index. Note that this can however use significant memory. Alternatively use a keyword field instead.
对于文本字段(text),默认情况下,fielddata功能是关闭的。 在[category_name]上设置fielddata = true,以便通过再次反转倒排索引来加载内存中的fielddata。 请注意,这可能会消耗大量的内存。 或者使用关键字字段代替。
【text字段默认是不开启fielddata的。ES会对text字段进行分词,并以词条为键建立倒排索引(词条→文档);而聚合需要的是文档→词条的关系,所以当我们对text字段进行聚合时,首先得在mapping中设置fielddata为true,然后再将倒排索引反转,得到文档id与词条的关系。】
支持两种同义词格式:Solr,WordNet。同义词synonym过滤参考
关于Solr synonyms的格式
1. 使用符号=>指定,比如易写错的词:
松驰 => 松弛
ture => true
2. 逗号分隔的列表
if a match is found then the synonym is emitted in place of the token.
比如,我们的文档中有“中国”二字,我们可以:
# example
中国,中华,China,中华人民共和国
这样一来,“中国,中华,China,中华人民共和国”都将与“中国”匹配
MORE:https://examples.javacodegeeks.com/enterprise-java/apache-solr/apache-solr-synonyms-example/
PUT /listing_new
{
"settings": {
"number_of_shards": 3,
"number_of_replicas": 0,
"analysis": {
"analyzer": {
"ik_synonym": {
"tokenizer": "ik_smart",
"filter": ["synonym"]
}
},
"filter": {
"synonym": {
"type": "synonym",
"synonyms_path": "analysis/synonym_listing.txt"
}
}
}
},
"mappings": {
"listing": {
"dynamic": true,
"properties": {
"listing_title": {
"type": "text",
"analyzer": "ik_synonym"
},
"category_name": {
"type": "text",
"analyzer": "ik_synonym",
"fielddata": true
},
"listing_id": {
"type": "long"
},
"category_id": {
"type": "long"
}
}
}
}
}
"caused_by": { "type": "malformed_input_exception", "reason": "Input length = 1" }
(出现该错误通常是因为同义词文件 analysis/synonym_listing.txt 不是UTF-8编码,将文件另存为UTF-8后重试。)
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.transport.client.PreBuiltTransportClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Map;
/**
 * Description: opens and closes the Elasticsearch transport client.
 *
 * <p>Created 2018/5/18 14:55.
 *
 * @author 王海
 * @version V1.0
 * @since api1.0
 */
public class OpenClose {
    private static final Logger LOGGER = LoggerFactory.getLogger(OpenClose.class);

    /** Client built by the most recent successful {@link #getClientWithMap(Map)} call; {@code null} until then. */
    TransportClient client = null;

    /**
     * Builds a transport client from the connection parameters held in a map.
     *
     * <p>Keys read from the map:
     * <ul>
     *   <li>{@code "cluster.name"} - cluster name; its {@code toString()} value is used</li>
     *   <li>{@code "server"} - raw IP address as a {@code byte[]} (not a host-name string)</li>
     *   <li>{@code "port"} - transport port as an {@code Integer} (the ES default transport port is 9300)</li>
     * </ul>
     *
     * @param mapParms parameter map containing the keys listed above
     * @return the newly created client; if the address is rejected, the value of the
     *         {@code client} field as it was before this call ({@code null} on a fresh instance)
     */
    public TransportClient getClientWithMap(Map<?, ?> mapParms) {
        // A client normally needs three things: cluster name, IP address and port.
        String clusterName = mapParms.get("cluster.name").toString();
        byte[] addressMaster = (byte[]) mapParms.get("server");
        int transport = (int) mapParms.get("port");

        // Settings.builder() returns the static nested Builder of Settings.
        Settings settings = Settings.builder().put("cluster.name", clusterName).build();
        try {
            // Resolve the address BEFORE constructing the client: if the byte[] has an
            // illegal length, getByAddress throws and no client has been allocated yet,
            // so nothing is left unclosed.
            InetAddress host = InetAddress.getByAddress(addressMaster);
            client = new PreBuiltTransportClient(settings)
                    .addTransportAddress(new InetSocketTransportAddress(host, transport));
            // Only report success when the client was actually built.
            LOGGER.info("初始化client成功 ===== {}", System.currentTimeMillis());
        } catch (UnknownHostException e) {
            // Pass the exception to the logger so the stack trace goes to the log, not stderr.
            LOGGER.error("初始化client失败 ===== " + e.getMessage(), e);
        }
        return client;
    }

    /**
     * Closes the given client if it is not {@code null}.
     *
     * <p>When the closed client is the one held in this object's {@code client} field,
     * the field is cleared as well so a closed client is not handed out again.
     *
     * @param client the client to close; {@code null} is ignored
     */
    public void closeClient(TransportClient client) {
        if (client != null) {
            client.close();
            if (this.client == client) {
                this.client = null;
            }
            LOGGER.info("关闭client成功 ===== {}", System.currentTimeMillis());
        }
    }
}
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import java.util.Map;
/**
* Descirption:默认查询和bool查询
*
* @author 王海
* @version V1.0
* @since 2018/5/18 22:23
*/
public class BaseAndBool {
public static String query(Map queryParams, TransportClient esClient) {
// 检索词
String queryWords = queryParams.get("query").toString();
// 检索模式
String queryMode = queryParams.get("mode").toString();
// 检索字段
String[] queryFields = (String[]) queryParams.get("fields");
// index
String index = queryParams.get("index").toString();
// type
String type = queryParams.get("type").toString();
int from = (int) queryParams.get("from");
int size = (int) queryParams.get("size");
// org.elasticsearch.index.query.QueryBuilder
QueryBuilder queryBuilder = null;
if ("MultiMatchQuery".equalsIgnoreCase(queryMode)) {
// 基础查询,默认使用OR
queryBuilder = QueryBuilders.multiMatchQuery(queryWords, queryFields);
} else {
// 使用bool查询,可使用must\should\mustNot等
String[] terms = queryWords.split("\\s+");
for (String term : terms) {
if (queryBuilder == null) {
queryBuilder = QueryBuilders.boolQuery().must(QueryBuilders.multiMatchQuery(term, queryFields));
} else {
queryBuilder = QueryBuilders.boolQuery().must(queryBuilder).must(QueryBuilders.multiMatchQuery(term, queryFields));
}
}
}
// QueryBuilders构建完毕,下面才是真正执行查询的地方
SearchResponse searchResponse = null;
// 在执行get()之前,返回的都是SearchRequestBuilder对象,SearchType.DEFAULT为QUERY_THEN_FETCH
// 不设置setTypes则默认index下的所有type
searchResponse = esClient.prepareSearch(index).setTypes(type).setSearchType(SearchType.DEFAULT).setQuery(queryBuilder).setFrom(from).setSize(size).get();
return searchResponse.toString();
}
}
import org.elasticsearch.client.transport.TransportClient;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import java.util.HashMap;
import java.util.Map;
/**
 * BaseAndBool Tester.
 *
 * <p>Requires a reachable Elasticsearch cluster with the {@code listing_new} index loaded;
 * the test only prints the responses and makes no assertions.
 *
 * @author
 * @version 1.0
 * @since 05/18/2018
 */
public class BaseAndBoolTest {
    @Before
    public void before() throws Exception {
    }

    @After
    public void after() throws Exception {
    }

    /**
     * Method: query(Map queryParams, TransportClient esClient)
     * Runs the same search once in multi-match mode and once in bool mode.
     */
    @Test
    public void testQuery() throws Exception {
        // Create the client.
        OpenClose openClose = new OpenClose();
        Map<String, Object> hashMap = new HashMap<>();
        hashMap.put("cluster.name", "elasticsearch-win");
        hashMap.put("server", new byte[]{(byte) 172, (byte) 22, (byte) 112, 1});
        hashMap.put("port", 9300);
        TransportClient client = openClose.getClientWithMap(hashMap);
        try {
            // Build the query parameters.
            Map<String, Object> queryParams = new HashMap<>();
            queryParams.put("index", "listing_new");
            queryParams.put("type", "listing");
            queryParams.put("query", "德芙 方便面");
            queryParams.put("fields", new String[]{"listing_title", "category_name"});
            // "from" is a zero-based offset, so 1 skips the top-ranked hit.
            queryParams.put("from", 1);
            queryParams.put("size", 10);

            // Multi-match mode.
            queryParams.put("mode", "MultiMatchQuery");
            String result_multi = BaseAndBool.query(queryParams, client);
            System.out.println(result_multi);

            // Bool mode.
            queryParams.put("mode", "bool");
            String result_bool = BaseAndBool.query(queryParams, client);
            System.out.println(result_bool);
        } finally {
            // Close the connection even when a query throws; closeClient ignores null.
            openClose.closeClient(client);
        }
    }
}
部分返回结果:
{"took":5,"timed_out":false,"_shards":{"total":3,"successful":3,"skipped":0,"failed":0},"hits":{"total":1528,"max_score":7.5759635,"hits":[{"_index":"listing_new","_type":"listing","_id":"AWNxycq8SnQysQYb0_Zd","_score":7.564964,"_source":{ "listing_id" : "7132", "listing_title" : "德芙 浓 醇 黑 巧克力 66 80g 零食 德芙 巧克力", "category_id" : "6", "category_name" : "巧克力" }},{"_index":"listing_new","_type":"listing","_id":"AWNxycq8SnQysQYb0_Zb","_score":7.4578214,"_source":{ "listing_id" : "7130", "listing_title" : "德芙 拼盘 290.5g 碗 装 德芙 巧克力 零食", "category_id" : "6", "category_name" : "巧克力" }}]}}
使用bool
{"took":4,"timed_out":false,"_shards":{"total":3,"successful":3,"skipped":0,"failed":0},"hits":{"total":0,"max_score":null,"hits":[]}}