1. Write flume-hbase.conf:
a1.sources = r1
a1.channels = hbaseC
a1.sinks = hbaseS2
a1.sources.r1.type = avro
a1.sources.r1.bind = spark1
# The Flume source on spark1 must listen on port 8989, matching the port the upstream agent sends to
a1.sources.r1.port = 8989
a1.sources.r1.threads = 5
#*********************flume+hbase**************************
a1.sources.r1.channels = hbaseC
a1.sinks.hbaseS2.channel = hbaseC
a1.channels.hbaseC.type = memory
a1.channels.hbaseC.capacity = 10000
a1.channels.hbaseC.transactionCapacity = 10000
a1.channels.hbaseC.keep-alive = 20
a1.sinks.hbaseS2.type = asynchbase
# weblogs is the HBase table name
a1.sinks.hbaseS2.table = weblogs
# info is the column family of the HBase table; UserDfAsyncHbaseEventSerializer is the custom serializer class
a1.sinks.hbaseS2.columnFamily = info
a1.sinks.hbaseS2.serializer = org.apache.flume.sink.hbase.UserDfAsyncHbaseEventSerializer
# payloadColumn lists the field names of the data Flume receives; they map to HBase column names
a1.sinks.hbaseS2.serializer.payloadColumn = datetime,userid,searchname,retorder,cliorder,cliurl
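The avro source above expects events from an upstream Flume agent. As a minimal sketch, the sending side's sink section could look like this (the agent name agent2, sink name avroS, and channel c1 are hypothetical; only the hostname and port must match the source above):

agent2.sinks.avroS.type = avro
agent2.sinks.avroS.hostname = spark1
agent2.sinks.avroS.port = 8989
agent2.sinks.avroS.channel = c1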
2. Create the table in HBase: create 'weblogs','info'
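For example, from the HBase shell, create the table and then confirm it exists:

$ hbase shell
hbase> create 'weblogs','info'
hbase> describe 'weblogs'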
3. Implement the custom UserDfAsyncHbaseEventSerializer class. Download the Flume source code, open the flume-ng-hbase-sink module, and add a UserDfAsyncHbaseEventSerializer class to that module that implements the AsyncHbaseEventSerializer interface, as follows:
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.flume.sink.hbase;

import com.google.common.base.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.hbase.async.AtomicIncrementRequest;
import org.hbase.async.PutRequest;

import java.util.ArrayList;
import java.util.List;

public class UserDfAsyncHbaseEventSerializer implements AsyncHbaseEventSerializer {
    // Table name
    private byte[] table;
    // Column family name
    private byte[] colFam;
    // Current event
    private Event currentEvent;
    // Column names
    private byte[][] columnNames;
    // Buffers for batching writes to HBase
    private final List<PutRequest> puts = new ArrayList<PutRequest>();
    private final List<AtomicIncrementRequest> incs = new ArrayList<AtomicIncrementRequest>();
    // Current row key
    private byte[] currentRowKey;
    private final byte[] eventCountCol = "eventCount".getBytes();

    @Override
    public void configure(Context context) {
        // Read the column names from the configuration file (payloadColumn)
        String cols = context.getString("payloadColumn");
        String[] names = cols.split(",");
        columnNames = new byte[names.length][];
        int i = 0;
        for (String name : names) {
            columnNames[i++] = name.getBytes();
        }
    }

    @Override
    public void cleanUp() {
        table = null;
        colFam = null;
        currentEvent = null;
        columnNames = null;
        currentRowKey = null;
    }

    @Override
    public List<PutRequest> getActions() {
        // Split the event body to obtain the value of each column
        String eventStr = new String(currentEvent.getBody());
        String[] cols = eventStr.split(",");
        puts.clear();
        if (cols.length == columnNames.length) {
            String datetime = cols[0];
            String userid = cols[1];
            // Generate the row key.
            // getHbaseRowKey(userid, datetime) is a custom helper added to
            // SimpleRowKeyGenerator in this module (not part of stock Flume).
            try {
                currentRowKey = SimpleRowKeyGenerator.getHbaseRowKey(userid, datetime);
            } catch (Exception e) {
                throw new FlumeException("Could not get row key!", e);
            }
            // Add one PutRequest per column
            for (int i = 0; i < cols.length; i++) {
                PutRequest putReq = new PutRequest(table, currentRowKey, colFam,
                        columnNames[i], cols[i].getBytes(Charsets.UTF_8));
                puts.add(putReq);
            }
        }
        return puts;
    }

    @Override
    public List<AtomicIncrementRequest> getIncrements() {
        // Increment the count of received events
        incs.clear();
        incs.add(new AtomicIncrementRequest(table, "totalEvents".getBytes(), colFam, eventCountCol));
        return incs;
    }

    // Initialize the table name and column family name
    @Override
    public void initialize(byte[] table, byte[] colFam) {
        this.table = table;
        this.colFam = colFam;
    }

    @Override
    public void setEvent(Event event) {
        this.currentEvent = event;
    }

    @Override
    public void configure(ComponentConfiguration conf) {
        // No additional configuration needed
    }
}
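For a quick local check of the serializer, the following minimal sketch (the class name SerializerSmokeTest and the sample record are hypothetical) feeds one comma-separated log line through it and shows that each field becomes one PutRequest under the configured column names:

package org.apache.flume.sink.hbase;

import com.google.common.base.Charsets;
import org.apache.flume.Context;
import org.apache.flume.event.EventBuilder;

public class SerializerSmokeTest {
    public static void main(String[] args) {
        UserDfAsyncHbaseEventSerializer s = new UserDfAsyncHbaseEventSerializer();
        // Same payloadColumn value as in flume-hbase.conf
        Context ctx = new Context();
        ctx.put("payloadColumn", "datetime,userid,searchname,retorder,cliorder,cliurl");
        s.configure(ctx);
        s.initialize("weblogs".getBytes(), "info".getBytes());
        // One sample search-log record, fields in the order configured above
        s.setEvent(EventBuilder.withBody(
                "2019-01-01 08:00:00,u001,flume,1,1,http://example.com", Charsets.UTF_8));
        // Expect 6 PutRequests, one per column, all sharing the generated row key
        System.out.println(s.getActions().size());
    }
}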
4. Package the module and upload the resulting jar to Flume's lib directory.
5. Start the HBase and Flume services.
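For example (paths are assumptions; adjust to your installation):

$HBASE_HOME/bin/start-hbase.sh
$FLUME_HOME/bin/flume-ng agent \
  --conf $FLUME_HOME/conf \
  --conf-file flume-hbase.conf \
  --name a1 \
  -Dflume.root.logger=INFO,console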
6. Configure Hive to integrate with HBase:
a. Configure the ZooKeeper connection in hive-site.xml; Hive uses these parameters to connect to HBase.
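A minimal sketch, assuming the ZooKeeper quorum runs on spark1, spark2, and spark3 with the default client port (host names other than spark1 are assumptions):

<property>
  <name>hbase.zookeeper.quorum</name>
  <value>spark1,spark2,spark3</value>
</property>
<property>
  <name>hbase.zookeeper.property.clientPort</name>
  <value>2181</value>
</property>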
b. Copy the following HBase dependency jars into Hive's lib directory (unnecessary if Hive and HBase ship with the same CDH version):
hbase-server-1.2.9.jar
hbase-client-1.2.9.jar
hbase-protocol-1.2.9.jar
hbase-it-1.2.9.jar
htrace-core-3.1.0-incubating.jar
hbase-hadoop2-compat-1.2.9.jar
hbase-hadoop-compat-1.2.9.jar
c. Create the table in Hive:
create external table weblogs(
id string,
datetime string,
userid string,
searchname string,
retorder string,
cliorder string,
cliurl string
) stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties ("hbase.columns.mapping" = ":key,info:datetime,info:userid,info:searchname,info:retorder,info:cliorder,info:cliurl")
tblproperties ("hbase.table.name" = "weblogs");
Finally, the data in HBase can be read through this Hive table for offline analysis.
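For example, a simple offline query against the mapped table (the analysis itself is just a sketch):

-- top 10 users by number of logged searches
select userid, count(*) as cnt
from weblogs
group by userid
order by cnt desc
limit 10;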