ES中的中文分词支持改为用IK分词
通过 Java API 写入数据时,需要在创建 mapping 时为相应字段指定 IK 分词器。
同时,ES 的批量写入方式也从原来使用的 BulkRequestBuilder,改成了参数更多、更灵活的 BulkProcessor。
1.原来的ElasticSearchOperator
package com.xxx.data;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.delete.DeleteRequestBuilder;
import org.elasticsearch.action.update.UpdateRequestBuilder;
import org.elasticsearch.client.Client;
import java.util.HashMap;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
//import org.elasticsearch.client.transport.TransportClient;
//import org.elasticsearch.common.settings.ImmutableSettings;
//import org.elasticsearch.common.settings.Settings;
//import org.elasticsearch.common.transport.InetSocketTransportAddress;
/**
 * Buffers Elasticsearch update and delete requests in one shared bulk request
 * and submits the buffer either when it holds more than {@link #MAX_BULK_COUNT}
 * actions or when the periodic {@link CommitTimer} fires.
 *
 * <p>Every access to the shared bulk builder happens while {@code commitLock}
 * is held.
 */
public class ElasticSearchOperator {
    // Buffer capacity: a flush is attempted once the buffered action count exceeds this value.
    private static final int MAX_BULK_COUNT = 10;
    // Period of the timed flush, in seconds.
    private static final int MAX_COMMIT_INTERVAL = 60 * 5;

    private static Client client = null;
    private static BulkRequestBuilder bulkRequestBuilder = null;
    private static final Lock commitLock = new ReentrantLock();

    static {
        // elasticsearch1.5.0
        // Settings settings = ImmutableSettings.settingsBuilder()
        // .put("cluster.name", Config.clusterName).build();
        // client = new TransportClient(settings)
        // .addTransportAddress(new InetSocketTransportAddress(
        // Config.nodeHost, Config.nodePort));
        // 2.3.5
        client = MyTransportClient.client;
        bulkRequestBuilder = newBulkRequestBuilder();
        // First timed flush after 10 seconds, then every MAX_COMMIT_INTERVAL seconds.
        // NOTE(review): java.util.Timer's default thread is non-daemon, so it keeps the JVM alive.
        Timer timer = new Timer();
        timer.schedule(new CommitTimer(), 10 * 1000, MAX_COMMIT_INTERVAL * 1000);
    }

    /**
     * Creates an empty bulk builder with refresh enabled.
     *
     * <p>All builders go through this method so that the refresh flag is not
     * lost when the buffer is replaced after a successful flush.
     *
     * @return a fresh, empty bulk request builder
     */
    private static BulkRequestBuilder newBulkRequestBuilder() {
        BulkRequestBuilder builder = client.prepareBulk();
        builder.setRefresh(true);
        return builder;
    }

    /**
     * Submits the buffered actions when their count exceeds {@code threshold}.
     *
     * <p>On success the buffer is replaced by an empty one. When the response
     * reports failures, the failure message is printed and the buffer is kept,
     * so the same actions are submitted again on the next trigger.
     * NOTE(review): an action that fails permanently will keep the buffer from
     * ever draining; confirm whether dropping or filtering failed items is wanted.
     *
     * <p>Callers must hold {@code commitLock}.
     *
     * @param threshold number of buffered actions that must be exceeded before submitting
     */
    private static void bulkRequest(int threshold) {
        if (bulkRequestBuilder.numberOfActions() <= threshold) {
            return;
        }
        BulkResponse bulkResponse = bulkRequestBuilder.execute().actionGet();
        if (bulkResponse.hasFailures()) {
            System.err.println("bulk request had failures, buffer kept for retry: "
                    + bulkResponse.buildFailureMessage());
            return;
        }
        bulkRequestBuilder = newBulkRequestBuilder();
    }

    /**
     * Adds an update request to the buffer and flushes if the buffer is over capacity.
     *
     * <p>Exceptions are printed and not propagated.
     *
     * @param builder update request to buffer
     */
    public static void addUpdateBuilderToBulk(UpdateRequestBuilder builder) {
        commitLock.lock();
        try {
            bulkRequestBuilder.add(builder);
            bulkRequest(MAX_BULK_COUNT);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            commitLock.unlock();
        }
    }

    /**
     * Adds a delete request to the buffer and flushes if the buffer is over capacity.
     *
     * <p>Exceptions are printed and not propagated.
     *
     * @param builder delete request to buffer
     */
    public static void addDeleteBuilderToBulk(DeleteRequestBuilder builder) {
        commitLock.lock();
        try {
            bulkRequestBuilder.add(builder);
            bulkRequest(MAX_BULK_COUNT);
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            commitLock.unlock();
        }
    }

    /**
     * Timed flush: submits whatever is buffered (threshold 0) so that buffered
     * actions do not wait indefinitely when no new requests arrive, which would
     * leave Elasticsearch out of sync with HBase.
     */
    static class CommitTimer extends TimerTask {
        @Override
        public void run() {
            commitLock.lock();
            try {
                bulkRequest(0);
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                commitLock.unlock();
            }
        }
    }

    /**
     * Manual smoke test: buffers delete requests for ids 10..19 of a fixed
     * index/type and prints how many actions remain buffered.
     */
    private static void test() {
        Config.indexName = "flume-2016-08-10";
        Config.typeName = "tweet";
        for (int i = 10; i < 20; i++) {
            Map<String, Object> json = new HashMap<String, Object>();
            json.put("field", "ttt");
            // Upsert variant:
            // addUpdateBuilderToBulk(client.prepareUpdate(Config.indexName, Config.typeName, String.valueOf(i)).setDoc(json).setUpsert(json));
            // Delete variant:
            addDeleteBuilderToBulk(client.prepareDelete(Config.indexName, Config.typeName, String.valueOf(i)));
        }
        System.out.println(bulkRequestBuilder.numberOfActions());
    }

    public static void main(String[] args) {
        test();
    }
}
2.改成ElasticSearchBulkProcessor
package com.xxx.data;
import org.elasticsearch.action.admin.indices.exists.indices.IndicesExistsRequest;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingRequest;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import java.util.*;
/**
 * Created by lisiyu on 16/9/19.
 *
 * <p>Queues index requests on a shared {@link BulkProcessor}. Before a request
 * is queued, the target index/type is given a mapping that analyzes the
 * supplied fields with the "ik" analyzer.
 */
public class ElasticSearchBulkProcessor {
    private static Client client = null;
    private static BulkProcessor bulkProcessor = null;
    // Buffer capacity, counted in requests.
    private static final int MAX_BULK_COUNT = 1000;
    // Buffer capacity, in megabytes.
    private static final int MAX_BULK_SIZE = 1024;
    // Maximum interval between flushes, in seconds.
    private static final int MAX_COMMIT_INTERVAL = 60 * 1;
    // Maximum number of concurrent bulk requests.
    private static final int MAX_CONCURRENT_REQUEST = 2;
    // Initial wait before a retry, in milliseconds.
    private static final int REJECT_EXCEPTION_RETRY_WAIT = 500;
    // Number of retries.
    private static final int REJECT_EXCEPTION_RETRY_TIMES = 3;

    static {
        // 2.3.5
        client = MyTransportClient.client;
        bulkProcessor = BulkProcessor.builder(client, new FailureReportingListener())
                .setBulkActions(MAX_BULK_COUNT)
                .setBulkSize(new ByteSizeValue(MAX_BULK_SIZE, ByteSizeUnit.MB))
                .setFlushInterval(TimeValue.timeValueSeconds(MAX_COMMIT_INTERVAL))
                .setConcurrentRequests(MAX_CONCURRENT_REQUEST)
                .setBackoffPolicy(
                        BackoffPolicy.exponentialBackoff(
                                TimeValue.timeValueMillis(REJECT_EXCEPTION_RETRY_WAIT),
                                REJECT_EXCEPTION_RETRY_TIMES))
                .build();
    }

    /**
     * Bulk listener that reports failed bulks on standard error instead of
     * silently discarding them.
     */
    private static final class FailureReportingListener implements BulkProcessor.Listener {
        @Override
        public void beforeBulk(long executionId, BulkRequest request) {
            // Nothing to do before a bulk is sent.
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            if (response.hasFailures()) {
                System.err.println("bulk " + executionId + " had failures: "
                        + response.buildFailureMessage());
            }
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
            System.err.println("bulk " + executionId + " failed");
            failure.printStackTrace();
        }
    }

    /**
     * Ensures the IK mapping for the request's index/type, then queues the
     * request on the shared bulk processor.
     *
     * <p>Any exception (including one from the mapping step) is printed and not
     * propagated; in that case the request is not queued.
     * NOTE(review): the mapping step issues cluster admin calls for every
     * request; consider remembering which index/type/field combinations have
     * already been mapped.
     *
     * @param indexRequest request to queue
     * @param fieldSet     names of the document fields to analyze with "ik"
     */
    public static void addIndexRequestToBulkProcessor(IndexRequest indexRequest, Set<String> fieldSet) {
        try {
            // Show which index and type the request targets.
            System.out.println("index:" + indexRequest.index());
            System.out.println("type:" + indexRequest.type());
            // Create the index if needed and declare the ik analyzer for the fields.
            createMapping(indexRequest.index(), indexRequest.type(), fieldSet);
            // Queue the document.
            bulkProcessor.add(indexRequest);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Declares a mapping in which every field of {@code fieldSet} is a stored
     * string analyzed with "ik". If the index already exists the mapping is put
     * on it; otherwise the index is created with that mapping.
     *
     * @param index       index name
     * @param mappingType mapping type name
     * @param fieldSet    names of the fields to map
     * @throws Exception if building the mapping or the admin request fails
     */
    public static void createMapping(String index, String mappingType, Set<String> fieldSet) throws Exception {
        boolean indexExists =
                client.admin().indices().exists(new IndicesExistsRequest(index)).actionGet().isExists();
        XContentBuilder builder = buildIkMapping(fieldSet);
        if (indexExists) {
            System.out.println("index: '" + index + "' is exist!");
            PutMappingRequest mapping = Requests.putMappingRequest(index).type(mappingType).source(builder);
            client.admin().indices().putMapping(mapping).actionGet();
        } else {
            System.out.println("create index: '" + index + "'!");
            client.admin().indices().prepareCreate(index).addMapping(mappingType, builder).get();
        }
    }

    /**
     * Builds the mapping source: a "properties" object holding a stored string
     * field "id" plus one stored, ik-analyzed string field per entry of
     * {@code fieldSet}. The index and type names are deliberately not part of
     * the source.
     *
     * @param fieldSet names of the fields to map
     * @return the mapping source
     * @throws java.io.IOException if the JSON builder fails
     */
    private static XContentBuilder buildIkMapping(Set<String> fieldSet) throws java.io.IOException {
        XContentBuilder builder = XContentFactory.jsonBuilder()
                .startObject()
                .startObject("properties")
                .startObject("id").field("type", "string").field("store", "yes").endObject();
        for (String field : fieldSet) {
            builder = builder.startObject(field)
                    .field("type", "string")
                    .field("store", "yes")
                    .field("analyzer", "ik")
                    .endObject();
        }
        return builder.endObject().endObject();
    }

    /**
     * Manual smoke test: indexes one document through a dedicated bulk
     * processor and waits for it to be flushed before returning.
     */
    public static void test() {
        // on startup
        Client testClient = MyTransportClient.client;
        BulkProcessor testProcessor = BulkProcessor.builder(testClient, new FailureReportingListener())
                .setBulkActions(10000)
                .setBulkSize(new ByteSizeValue(1, ByteSizeUnit.GB))
                .setFlushInterval(TimeValue.timeValueSeconds(5))
                .setConcurrentRequests(1)
                .setBackoffPolicy(
                        BackoffPolicy.exponentialBackoff(TimeValue.timeValueMillis(100), 3))
                .build();
        try {
            Map<String, Object> json = new HashMap<String, Object>();
            json.put("field", "test");
            testProcessor.add(new IndexRequest("twitter", "tweet", "1111").source(json));
        } finally {
            // Flush the queued request and wait for it to finish; without this
            // the document can still be buffered when main() returns.
            try {
                if (!testProcessor.awaitClose(30, java.util.concurrent.TimeUnit.SECONDS)) {
                    System.err.println("bulk processor did not finish within 30 seconds");
                }
            } catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
            }
        }
    }

    public static void main(String[] args) {
        test();
    }
}
3.DataSyncObserver类修改
/**
 * After a put, indexes the row into Elasticsearch through
 * {@code ElasticSearchBulkProcessor}, which also declares the IK analyzer for
 * the written columns.
 *
 * <p>The row key becomes the document id. Every cell of the put contributes one
 * document field, named by the column qualifier and valued by the cell value;
 * the set of qualifiers is passed along as the fields to map.
 *
 * <p>The earlier implementation sent an update/upsert through
 * {@code ElasticSearchOperator.addUpdateBuilderToBulk} and therefore did not
 * create an IK mapping.
 *
 * <p>Any exception is logged and not propagated.
 */
@Override
public void postPut(ObserverContext e, Put put, WALEdit edit, Durability durability) throws IOException {
    try {
        // Decode the row key the same way as qualifiers and values, instead of
        // with the platform default charset.
        String indexId = Bytes.toString(put.getRow());
        NavigableMap<byte[], List<Cell>> familyMap = put.getFamilyCellMap();
        HashSet<String> fieldNames = new HashSet<String>();
        HashMap<String, Object> json = new HashMap<String, Object>();
        for (List<Cell> cells : familyMap.values()) {
            for (Cell cell : cells) {
                String key = Bytes.toString(CellUtil.cloneQualifier(cell));
                String value = Bytes.toString(CellUtil.cloneValue(cell));
                json.put(key, value);
                fieldNames.add(key);
            }
        }
        IndexRequest indexRequest =
                new IndexRequest(Config.indexName, Config.typeName, indexId).source(json);
        ElasticSearchBulkProcessor.addIndexRequestToBulkProcessor(indexRequest, fieldNames);
        LOG.info("observer -- add new doc: " + indexId + " to type: " + Config.typeName);
    } catch (Exception ex) {
        LOG.error("observer -- failed to add doc to type: " + Config.typeName, ex);
    }
}
4.测试
- 代码打包
- jar包上传到hdfs
- 创建hbase表,并修改表属性关联observer
- 测试put新数据
- 查看es中数据
- 中文分词测试
{"query":{"query_string":{"query":"拖鞋"}},"highlight":{"require_field_match":false,"explain":true,"fields":{"*":{}}}}
5.程序代码整体和其余测试等操作可以查看另一篇文章
Sqoop导入HBase,并借助Coprocessor协处理器同步索引到ES