flume的优点之一就是支持插件扩展功能。现在clickhouse很流行,数据想直接写入clickhouse,在flume官网看了一下,并没有clickhouse sink,没有咱们就自己写呗。
网上开发自定义拦截器的文章很多,开发sink的反而找不到,特记录一下,供大家参考。
新建maven工程,在pom.xml添加如下依赖。
<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>19.0</version>
    </dependency>
    <dependency>
        <groupId>ru.yandex.clickhouse</groupId>
        <artifactId>clickhouse-jdbc</artifactId>
        <version>0.2</version>
    </dependency>
</dependencies>
package org.apache.flume.sink.clickhouse;
import org.apache.flume.Context;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
/**
 * Minimal skeleton of a custom Flume sink.
 *
 * <p>A sink extends {@link AbstractSink} and implements {@link Configurable}; the four
 * methods below are the extension points that the rest of this article fills in.
 */
public class ClickHouseSink extends AbstractSink implements Configurable {

    /**
     * Reads the sink's parameters from the Flume configuration.
     *
     * @param context configuration values defined for this sink
     */
    @Override
    public void configure(Context context) {
        // Parse and validate configuration parameters here.
    }

    /** Initializes resources (connections, counters) before events are processed. */
    @Override
    public void start() {
        // Acquire resources here, then let AbstractSink record the lifecycle change.
        super.start();
    }

    /** Releases the resources acquired in {@link #start()}. */
    @Override
    public void stop() {
        // Release resources here, then let AbstractSink record the lifecycle change.
        super.stop();
    }

    /**
     * Drains events from the channel and delivers them to the destination.
     *
     * @return {@code Status.READY} when events were delivered, {@code Status.BACKOFF}
     *         when there was nothing to do or delivery failed
     * @throws EventDeliveryException if the events cannot be delivered
     */
    @Override
    public Status process() throws EventDeliveryException {
        // Placeholder so the skeleton compiles; the real implementation is shown later.
        return Status.BACKOFF;
    }
}
configure函数用于解析处理配置参数,它接收Context对象,可以用context.getString("xxx")获取配置信息,例如context.getString("host")可以获得ClickHouseSink配置的host参数。
/**
 * Reads the mandatory {@code host} setting and stores it as a ClickHouse JDBC URL prefix.
 *
 * @param context configuration values defined for this sink
 * @throws IllegalArgumentException if the host setting is missing or empty
 */
@Override
public void configure(Context context) {
    final String configuredHost = context.getString(HOST);
    final boolean hostPresent = configuredHost != null && !configuredHost.isEmpty();
    Preconditions.checkArgument(hostPresent, "ClickHouse host must be specified!");

    // Accept both a bare host name and a value that already carries the JDBC scheme.
    final String scheme = "jdbc:clickhouse://";
    this.host = configuredHost.startsWith(scheme) ? configuredHost : scheme + configuredHost;
}
start函数是启动时的初始化,用于建立clickhouse连接对象,启动flume内置计数器等。
/**
 * Builds the ClickHouse data source from the configured host, port, database and
 * credentials, then starts the sink counter and the sink itself.
 */
@Override
public void start() {
    // withCredentials returns the properties object to use, so keep its result.
    final ClickHouseProperties defaults = new ClickHouseProperties();
    final ClickHouseProperties credentials = defaults.withCredentials(this.user, this.password);

    final String url = String.format("%s:%s/%s", this.host, this.port, this.database);
    this.dataSource = new BalancedClickhouseDataSource(url, credentials);

    sinkCounter.start();
    super.start();
}
stop函数是负责退出前的回收清理工作。
/**
 * Logs the shutdown, records the closed connection on the sink counter, stops the
 * counter and finally stops the sink itself.
 */
@Override
public void stop() {
    final String sinkName = getName();
    logger.info("ClickHouse sink {} stopping", sinkName);

    // Update the counter before stopping it.
    sinkCounter.incrementConnectionClosedCount();
    sinkCounter.stop();

    super.stop();
}
process函数就是核心的处理函数了。要注意的是flume的数据传输是事务性的,可以保证数据不丢失,所以我们开发的sink在消费channel的数据时,也要用事务。
/**
 * Transaction skeleton for consuming events from the channel.
 *
 * <p>All channel reads happen between {@code begin()} and {@code commit()}; on any failure
 * the transaction is rolled back so the events stay in the channel.
 *
 * @return {@code Status.READY} after a successful commit, {@code Status.BACKOFF} after a
 *         rollback
 * @throws EventDeliveryException declared by the sink contract; not thrown by this skeleton
 */
@Override
public Status process() throws EventDeliveryException {
    Status status;
    Channel ch = getChannel();
    Transaction txn = ch.getTransaction();
    txn.begin();
    try {
        // Take events with ch.take() and write them to ClickHouse here.
        txn.commit();
        status = Status.READY;
    } catch (Throwable t) {
        txn.rollback();
        // Never drop the failure silently: log it and report BACKOFF instead of null.
        logger.error(t.getMessage(), t);
        status = Status.BACKOFF;
        // Errors (e.g. OutOfMemoryError) must not be hidden from the runtime.
        if (t instanceof Error) {
            throw (Error) t;
        }
    } finally {
        txn.close();
    }
    return status;
}
clickhouse的数据写入方式有很多种,具体大家可以参考官方文档,这里我们用的是json方式写入。
// Encode explicitly as UTF-8 so the payload does not depend on the platform default charset.
byte[] payload = batch.toString().getBytes("UTF-8");
// try-with-resources closes the statement even when the insert fails.
// The connection `conn` is obtained outside this fragment and must be closed by its owner.
try (ClickHouseStatement sth = conn.createStatement()) {
    sth.write()
            .table(String.format(" %s.%s", database, table))
            .data(new ByteArrayInputStream(payload), ClickHouseFormat.JSONEachRow)
            .addDbParam(ClickHouseQueryParam.MAX_PARALLEL_REPLICAS, "2")
            .send();
}
每次操作还需要修改flume的内置计数器,例如:
// Number of events about to be written in this batch.
sinkCounter.addToEventDrainAttemptCount(count);
// After a successful write, add the same number of events to the success counter
// (incrementEventDrainSuccessCount() would only add 1, regardless of the batch size).
sinkCounter.addToEventDrainSuccessCount(count);
用mvn package编译,生成flume-ng-clickhouse-sink-1.0.jar包,放到flume的lib目录下。
# Fully qualified class name of the custom sink.
standard_storage.sinks.sink2ch.type = org.apache.flume.sink.clickhouse.ClickHouseSink
# Channel this sink takes its events from.
standard_storage.sinks.sink2ch.channel = channel2ch
# Required. ClickHouse host; the sink prepends "jdbc:clickhouse://" when it is missing.
standard_storage.sinks.sink2ch.host = xxxx.xxxxx.com
# Optional. The sink falls back to 8123 when this is not set.
standard_storage.sinks.sink2ch.port = 8123
# Required. Target database.
standard_storage.sinks.sink2ch.database = xxxxx
# Required. Target table.
standard_storage.sinks.sink2ch.table = xxxxx
# Optional. Maximum number of events taken per transaction; the sink falls back to 10000.
standard_storage.sinks.sink2ch.batchSize = 10000
# Optional. Credentials; both fall back to an empty string.
standard_storage.sinks.sink2ch.user = xxxxxx
standard_storage.sinks.sink2ch.password = xxxxxxxxxxxx
如上面这个例子,配置clickhouse的host、port、database、table、batchSize、user、password即可。type要配置为ClickHouseSink的完整类名(包含包名)。
ClickHouseSink.java
package org.apache.flume.sink.clickhouse;
import com.google.common.base.Preconditions;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import ru.yandex.clickhouse.BalancedClickhouseDataSource;
import ru.yandex.clickhouse.ClickHouseConnectionImpl;
import ru.yandex.clickhouse.ClickHouseStatement;
import ru.yandex.clickhouse.domain.ClickHouseFormat;
import ru.yandex.clickhouse.settings.ClickHouseProperties;
import ru.yandex.clickhouse.settings.ClickHouseQueryParam;
import java.io.ByteArrayInputStream;
import static org.apache.flume.sink.clickhouse.ClickHouseSinkConstants.*;
/**
 * Flume sink that drains events from its channel in batches and inserts them into a
 * ClickHouse table using the {@code JSONEachRow} input format.
 *
 * <p>Every event body is decoded as UTF-8 and sent as one line of the insert payload, so
 * each body presumably has to be a single-line JSON object (NOTE(review): confirm against
 * the producers feeding this sink).
 *
 * <p>Configuration keys are defined in {@code ClickHouseSinkConstants}: {@code host},
 * {@code database} and {@code table} are mandatory; {@code port}, {@code user},
 * {@code password} and {@code batchSize} fall back to the defaults defined there.
 */
public class ClickHouseSink extends AbstractSink implements Configurable {
    private static final Logger logger = LoggerFactory.getLogger(ClickHouseSink.class);

    /** Scheme prefix that turns a bare host name into a ClickHouse JDBC URL. */
    private static final String JDBC_URL_PREFIX = "jdbc:clickhouse://";

    /** Charset used both to decode event bodies and to encode the insert payload. */
    private static final String BODY_CHARSET = "UTF-8";

    private BalancedClickhouseDataSource dataSource = null;
    private SinkCounter sinkCounter = null;
    private String host = null;
    private String port = null;
    private String user = null;
    private String password = null;
    private String database = null;
    private String table = null;
    private int batchSize;

    /**
     * Reads and validates the sink configuration.
     *
     * @param context configuration values defined for this sink
     * @throws IllegalArgumentException if host, database or table is missing or empty, or
     *         if batchSize is not positive
     */
    @Override
    public void configure(Context context) {
        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }

        String configuredHost = requireNonEmpty(context, HOST, "ClickHouse host must be specified!");
        // Accept both a bare host name and a value that already carries the JDBC scheme.
        this.host = configuredHost.startsWith(JDBC_URL_PREFIX)
                ? configuredHost
                : JDBC_URL_PREFIX + configuredHost;
        this.database = requireNonEmpty(context, DATABASE, "ClickHouse database must be specified!");
        this.table = requireNonEmpty(context, TABLE, "ClickHouse table must be specified!");

        this.port = context.getString(PORT, DEFAULT_PORT);
        this.user = context.getString(USER, DEFAULT_USER);
        this.password = context.getString(PASSWORD, DEFAULT_PASSWORD);
        this.batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
        // A non-positive batch size would make process() take nothing and back off forever.
        Preconditions.checkArgument(this.batchSize > 0,
                "ClickHouse batchSize must be greater than 0, but was %s", this.batchSize);
    }

    /** Returns the value configured for {@code key}, rejecting a missing or empty value. */
    private static String requireNonEmpty(Context context, String key, String message) {
        String value = context.getString(key);
        Preconditions.checkArgument(value != null && !value.isEmpty(), message);
        return value;
    }

    /**
     * Creates the ClickHouse data source from the configured connection settings and
     * starts the sink counter.
     */
    @Override
    public void start() {
        String jdbcUrl = String.format("%s:%s/%s", this.host, this.port, this.database);
        ClickHouseProperties properties =
                new ClickHouseProperties().withCredentials(this.user, this.password);
        this.dataSource = new BalancedClickhouseDataSource(jdbcUrl, properties);
        sinkCounter.start();
        super.start();
    }

    /** Updates and stops the sink counter, then stops the sink. */
    @Override
    public void stop() {
        logger.info("ClickHouse sink {} stopping", getName());
        sinkCounter.incrementConnectionClosedCount();
        sinkCounter.stop();
        super.stop();
    }

    /**
     * Takes up to {@code batchSize} events from the channel inside one channel transaction
     * and inserts them into ClickHouse with a single request.
     *
     * <p>The transaction is committed only after the insert succeeded; on any failure it is
     * rolled back so the events remain in the channel.
     *
     * @return {@code Status.READY} when a batch was written, {@code Status.BACKOFF} when the
     *         channel was empty or the write failed
     * @throws EventDeliveryException declared by the sink contract; failures are logged and
     *         reported as {@code Status.BACKOFF} instead
     */
    @Override
    public Status process() throws EventDeliveryException {
        Channel ch = getChannel();
        Transaction txn = ch.getTransaction();
        txn.begin();
        try {
            StringBuilder batch = new StringBuilder();
            int count;
            for (count = 0; count < batchSize; ++count) {
                Event event = ch.take();
                if (event == null) {
                    break;
                }
                batch.append(new String(event.getBody(), BODY_CHARSET)).append('\n');
            }

            if (count <= 0) {
                // Nothing to write: finish the transaction without opening a connection.
                sinkCounter.incrementBatchEmptyCount();
                txn.commit();
                return Status.BACKOFF;
            }
            if (count < batchSize) {
                sinkCounter.incrementBatchUnderflowCount();
            } else {
                sinkCounter.incrementBatchCompleteCount();
            }
            sinkCounter.addToEventDrainAttemptCount(count);

            // Encode with the same charset used for decoding, not the platform default.
            byte[] payload = batch.toString().getBytes(BODY_CHARSET);
            // try-with-resources returns the connection and statement after every batch;
            // without it each call to process() would leak one connection.
            try (ClickHouseConnectionImpl conn = (ClickHouseConnectionImpl) dataSource.getConnection();
                 ClickHouseStatement sth = conn.createStatement()) {
                sth.write()
                        .table(String.format(" %s.%s", database, table))
                        .data(new ByteArrayInputStream(payload), ClickHouseFormat.JSONEachRow)
                        .addDbParam(ClickHouseQueryParam.MAX_PARALLEL_REPLICAS, "2")
                        .send();
            }

            txn.commit();
            // Count the whole batch, and only once the channel transaction is committed.
            sinkCounter.addToEventDrainSuccessCount(count);
            return Status.READY;
        } catch (Throwable t) {
            // Log first so the cause is not lost if the rollback itself fails.
            logger.error(t.getMessage(), t);
            txn.rollback();
            // re-throw all Errors
            if (t instanceof Error) {
                throw (Error) t;
            }
            return Status.BACKOFF;
        } finally {
            txn.close();
        }
    }
}
ClickHouseSinkConstants.java
package org.apache.flume.sink.clickhouse;
/**
 * Configuration keys and default values understood by the ClickHouse sink.
 *
 * <p>This class only holds constants and cannot be instantiated or extended.
 */
public final class ClickHouseSinkConstants {
    /** Key of the ClickHouse host setting. */
    public static final String HOST = "host";
    /** Key of the ClickHouse port setting. */
    public static final String PORT = "port";
    /** Key of the setting that limits the number of events per batch. */
    public static final String BATCH_SIZE = "batchSize";
    /** Key of the user name setting. */
    public static final String USER = "user";
    /** Key of the password setting. */
    public static final String PASSWORD = "password";
    /** Key of the target database setting. */
    public static final String DATABASE = "database";
    /** Key of the target table setting. */
    public static final String TABLE = "table";

    /** Port used when {@link #PORT} is not configured. */
    public static final String DEFAULT_PORT = "8123";
    /** Batch size used when {@link #BATCH_SIZE} is not configured. */
    public static final int DEFAULT_BATCH_SIZE = 10000;
    /** User name used when {@link #USER} is not configured. */
    public static final String DEFAULT_USER = "";
    /** Password used when {@link #PASSWORD} is not configured. */
    public static final String DEFAULT_PASSWORD = "";

    private ClickHouseSinkConstants() {
        // Constants holder; not meant to be instantiated.
    }
}