Flume part 2: writing JSON data from flume into hbase (flume-hbase-sink)
The same approach works for non-JSON data: have an interceptor join the fields into a single string delimited by ::, then send it on; that works just as well.
Enough talk, straight to the good stuff.
I. Custom interceptor:
1 Interceptor requirement: create a brand-new project and package it on its own, so that each flume interceptor is built as its own jar. That way, modifying one interceptor never affects the other flume jobs. (A packaging sketch follows the dependency list below.)
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
    <scala.version>2.10.4</scala.version>
    <flume.version>1.8.0</flume.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>${flume.version}</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>commons-net</groupId>
        <artifactId>commons-net</artifactId>
        <version>3.3</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.4</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>org.testng</groupId>
        <artifactId>testng</artifactId>
        <version>6.1.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-store-sdk</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-core</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-common</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-format</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-hadoop</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata-processing</artifactId>
        <version>1.5.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.carbondata</groupId>
        <artifactId>carbondata</artifactId>
        <version>1.5.3</version>
        <scope>system</scope>
        <systemPath>${project.basedir}/lib/apache-carbondata-1.5.3-bin-spark2.3.2-hadoop2.6.0-cdh5.16.1.jar</systemPath>
    </dependency>
    <dependency>
        <groupId>org.apache.mina</groupId>
        <artifactId>mina-core</artifactId>
        <version>2.0.9</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.mockito</groupId>
        <artifactId>mockito-all</artifactId>
        <version>1.9.5</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.sshd</groupId>
        <artifactId>sshd-core</artifactId>
        <version>0.14.0</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>com.jcraft</groupId>
        <artifactId>jsch</artifactId>
        <version>0.1.54</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
        <scope>provided</scope>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-lang3</artifactId>
        <version>3.5</version>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>16.0.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-clients</artifactId>
        <version>0.11.0.0</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
        <scope>compile</scope>
    </dependency>
</dependencies>
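To get a single deployable jar out of this project, something like the maven-shade-plugin can be added to the pom. A minimal sketch (the plugin version is an assumption; provided-scope dependencies such as flume-ng-core stay out of the shaded jar, which is what you want since flume already ships them):

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>3.1.0</version>
            <executions>
                <execution>
                    <!-- build the shaded jar during mvn package -->
                    <phase>package</phase>
                    <goals>
                        <goal>shade</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>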
2 The interceptor code is as follows:
(The interceptor's job: pull the fields out of a body JSON that is nested two levels deep and join them into a single string delimited by ::. Any delimiter works; the choice is yours.)
package com.extracting.flume.interceptor.xy;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.google.common.collect.Lists;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.List;

/**
 * Flattens a "body" JSON nested two levels deep into a single ::-delimited
 * string, ready for the hbase sink's RegexHbaseEventSerializer.
 */
public class XyHbaseInterceptorTC implements Interceptor {

    private static final Logger logger = LoggerFactory.getLogger(XyHbaseInterceptorTC.class);

    private SimpleDateFormat dataFormat;

    @Override
    public void initialize() {
        dataFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    }

    @Override
    public Event intercept(Event event) {
        String body = new String(event.getBody(), StandardCharsets.UTF_8);
        try {
            // the payload sits two levels deep: {"body": {"body": {...fields...}}}
            JSONObject jsonObject = JSON.parseObject(body);
            JSONObject bodyObject = jsonObject.getJSONObject("body").getJSONObject("body");
            String s = getString(bodyObject, "id") + "::"
                    + getString(bodyObject, "name") + "::"
                    + getString(bodyObject, "age") + "::"
                    + getStringDate(bodyObject, "time", dataFormat);
            logger.info("interceptor output: " + s);
            event.setBody(s.getBytes(StandardCharsets.UTF_8));
            return event;
        } catch (Exception e) {
            // drop malformed events instead of letting the exception escape
            logger.error("malformed event dropped: " + body);
            return null;
        }
    }

    @Override
    public List<Event> intercept(List<Event> events) {
        List<Event> resultList = Lists.newArrayList();
        for (Event event : events) {
            Event result = intercept(event);
            if (result != null) {
                resultList.add(result);
            }
        }
        return resultList;
    }

    @Override
    public void close() {
    }

    public static class Builder implements Interceptor.Builder {
        @Override
        public Interceptor build() {
            return new XyHbaseInterceptorTC();
        }

        @Override
        public void configure(Context context) {
        }
    }

    /** Returns the field as a String, or null when the key is absent. */
    public static String getString(JSONObject jsonObject, String key) {
        Object value = jsonObject.get(key);
        return value != null ? value.toString() : null;
    }

    /** Formats an epoch-millisecond field as yyyy-MM-dd HH:mm:ss, or null when absent. */
    public static String getStringDate(JSONObject jsonObject, String key, SimpleDateFormat dataFormat) {
        Object value = jsonObject.get(key);
        if (value == null) {
            return null;
        }
        return dataFormat.format(new Date(Long.parseLong(value.toString())));
    }
}
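Before deploying, the interceptor can be sanity-checked from a plain main method. A minimal sketch, where the sample JSON and the demo class are made up:

package com.extracting.flume.interceptor.xy;

import org.apache.flume.Event;
import org.apache.flume.event.EventBuilder;
import org.apache.flume.interceptor.Interceptor;

import java.nio.charset.StandardCharsets;

public class XyHbaseInterceptorTCDemo {
    public static void main(String[] args) {
        // hypothetical input: the payload nested two levels deep under "body"
        String json = "{\"body\":{\"body\":{\"id\":\"1001\",\"name\":\"tom\",\"age\":18,\"time\":1560000000000}}}";
        Interceptor interceptor = new XyHbaseInterceptorTC.Builder().build();
        interceptor.initialize();
        Event out = interceptor.intercept(EventBuilder.withBody(json.getBytes(StandardCharsets.UTF_8)));
        // prints 1001::tom::18::<time formatted as yyyy-MM-dd HH:mm:ss in the local zone>
        System.out.println(new String(out.getBody(), StandardCharsets.UTF_8));
    }
}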
3 Package the jar and upload it to flume's lib directory; on CDH that is /opt/cloudera/parcels/CDH/lib/flume-ng/lib/
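For example (the jar name below is hypothetical; use whatever your build actually produces):
cp flume-interceptor-xy-1.0.jar /opt/cloudera/parcels/CDH/lib/flume-ng/lib/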
II. Upload the hbase sink jars
Upload all the hbase-sink-related jars to the /opt/cloudera/parcels/CDH/lib/flume-ng/lib/ directory:
flume-ng-hbase-sink-1.6.0-cdh5.16.1.jar
hbase-client-1.2.0-cdh5.16.1.jar
hbase-server-1.2.0-cdh5.16.1.jar
plus the other jars that hbase depends on. I no longer remember exactly which ones (guava-12.0.1.jar and a few others probably also need to be copied), so simply copying every jar under hbase's lib into flume's lib is also fine, as shown below.
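On CDH that bulk copy would look something like this (assuming the usual parcel layout):
cp /opt/cloudera/parcels/CDH/lib/hbase/lib/*.jar /opt/cloudera/parcels/CDH/lib/flume-ng/lib/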
III. Configure the flume conf
1 In the /opt/cloudera/parcels/CDH/lib/flume-ng/conf directory, run
vi hbase.conf
and enter the following:
ng.sources = kafkaSource
ng.channels = memorychannel
ng.sinks = hbasesink
ng.sources.kafkaSource.type= org.apache.flume.source.kafka.KafkaSource
ng.sources.kafkaSource.kafka.bootstrap.servers=cdh01:9092,cdh02:9092,cdh03:9092
ng.sources.kafkaSource.kafka.consumer.group.id=xytest1
ng.sources.kafkaSource.kafka.topics=pd_ry_txjl
ng.sources.kafkaSource.batchSize=1000
ng.sources.kafkaSource.channels= memorychannel
ng.sources.kafkaSource.kafka.consumer.auto.offset.reset=latest
ng.sources.kafkaSource.interceptors = i1
# the interceptor from part I; flume reads this file as Java properties,
# so a trailing # comment on a property line would become part of the value
ng.sources.kafkaSource.interceptors.i1.type = com.extracting.flume.interceptor.xy.XyHbaseInterceptorTC$Builder
ng.channels.memorychannel.type = memory
ng.channels.memorychannel.keep-alive = 3
ng.channels.memorychannel.byteCapacityBufferPercentage = 20
ng.channels.memorychannel.transactionCapacity = 10000
ng.channels.memorychannel.capacity = 100000
ng.sinks.hbasesink.type = org.apache.flume.sink.hbase.HBaseSink
ng.sinks.hbasesink.table = table_boy
# column family name
ng.sinks.hbasesink.columnFamily = cf1
ng.sinks.hbasesink.serializer = org.apache.flume.sink.hbase.RegexHbaseEventSerializer
# :: is the field delimiter
ng.sinks.hbasesink.serializer.regex = (.*)::(.*)::(.*)::(.*)
# the first captured group becomes the rowkey (here the id); the rest land in cf1
ng.sinks.hbasesink.serializer.colNames = ROW_KEY,name,age,time
ng.sinks.hbasesink.serializer.rowKeyIndex = 0
ng.sinks.hbasesink.channel = memorychannel
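To see how RegexHbaseEventSerializer's regex carves the interceptor output into the configured columns, here is a minimal sketch using plain java.util.regex (the sample values are made up):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexSplitDemo {
    public static void main(String[] args) {
        // a made-up line as emitted by the interceptor
        String body = "1001::tom::18::2019-06-08 21:20:00";
        Matcher m = Pattern.compile("(.*)::(.*)::(.*)::(.*)").matcher(body);
        if (m.matches()) {
            // with colNames = ROW_KEY,name,age,time and rowKeyIndex = 0:
            // group 1 becomes the rowkey, groups 2-4 become cells in cf1
            System.out.println("rowkey   = " + m.group(1));
            System.out.println("cf1:name = " + m.group(2));
            System.out.println("cf1:age  = " + m.group(3));
            System.out.println("cf1:time = " + m.group(4));
        }
    }
}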
2 Start flume:
Before starting, create the table in hbase first.
Open the hbase shell: hbase shell
Create the table_boy table with two column families, cf1 and cf2:
create 'table_boy', 'cf1', 'cf2'
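You can confirm the table was created before starting flume:
describe 'table_boy'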
Then start flume in the foreground for debugging:
bin/flume-ng agent -n ng -c conf -f conf/hbase.conf
The CDH build of flume writes its log to /var/log/flume/flume.log by default.
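To check that events are arriving, scan a few rows from the hbase shell:
scan 'table_boy', {LIMIT => 5}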
Once you have verified that the data is landing in hbase and everything looks right, you can submit it in the background:
nohup bin/flume-ng agent -n ng -c conf -f conf/hbase.conf &
To stop the job:
jcmd | grep hbase.conf   # find the process running with hbase.conf
then kill that process id.
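Or as a one-liner (assuming exactly one matching process):
kill $(jcmd | grep hbase.conf | awk '{print $1}')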