HBase MapReduce implementation and Hive integration

HBase MapReduce implementation

Requirement 1: read data from the myuser table and write it into another HBase table

HBaseReadWrite:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HBaseReadWrite extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {

        Configuration conf = super.getConf();
        Scan scan = new Scan();

        Job job = Job.getInstance(conf, "hbaseReadWrite");

        // Read from the source table myuser; the mapper emits <rowkey, Put>
        TableMapReduceUtil.initTableMapperJob("myuser", scan, HBaseMapper.class, Text.class, Put.class, job);
        // Write the Puts into the target table myuser2
        TableMapReduceUtil.initTableReducerJob("myuser2", HBaseReducer.class, job);

        job.setNumReduceTasks(2);

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");

        int run = ToolRunner.run(configuration, new HBaseReadWrite(), args);
        System.exit(run);
    }
}

HBaseMapper

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;
import java.util.List;

public class HBaseMapper extends TableMapper<Text, Put> {

    @Override
    protected void map(ImmutableBytesWritable key, Result result, Context context) throws IOException, InterruptedException {

        byte[] rowKeyBytes = key.get();
        String rowkey = Bytes.toString(rowKeyBytes);

        // Collect the f1:name and f1:age cells of this row into a single Put
        Put put = new Put(rowKeyBytes);

        List<Cell> cells = result.listCells();
        for (Cell cell : cells) {

            byte[] family = cell.getFamily();
            byte[] qualifier = cell.getQualifier();

            if ("f1".equals(Bytes.toString(family))) {
                // A cell carries only one qualifier, so the check must be OR, not AND
                if ("name".equals(Bytes.toString(qualifier)) || "age".equals(Bytes.toString(qualifier))) {
                    put.add(cell);
                }
            }
        }

        if (!put.isEmpty()) {
            context.write(new Text(rowkey), put);
        }
    }
}

HBaseReducer

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.Text;

import java.io.IOException;

public class HBaseReducer extends TableReducer<Text, Put, ImmutableBytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<Put> values, Context context) throws IOException, InterruptedException {

        // The key is the rowkey; simply forward every Put to the output table
        ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
        immutableBytesWritable.set(key.toString().getBytes());

        for (Put put : values) {
            context.write(immutableBytesWritable, put);
        }
    }
}
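
The job above assumes the target table myuser2 already exists with the column family f1. If it does not, the sketch below shows one way to create it with the HBase 1.x Admin API; the class name CreateMyuser2Table is just for illustration, while the table name, column family, and ZooKeeper quorum are taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

// Illustrative helper (not part of the original job): creates myuser2 with family f1 if it is missing
public class CreateMyuser2Table {

    public static void main(String[] args) throws Exception {
        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");

        try (Connection connection = ConnectionFactory.createConnection(configuration);
             Admin admin = connection.getAdmin()) {
            TableName tableName = TableName.valueOf("myuser2");
            if (!admin.tableExists(tableName)) {
                // The reducer only writes to the f1 column family, so that is all we need
                HTableDescriptor descriptor = new HTableDescriptor(tableName);
                descriptor.addFamily(new HColumnDescriptor("f1"));
                admin.createTable(descriptor);
            }
        }
    }
}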

Requirement 2: read a file from HDFS and write it into an HBase table
The HDFS file /hbase/input/user.txt contains the following data:
0007 zhangsan 18
0008 lisi 25
0009 wangwu 20

Step 1: prepare the data file
Prepare the data file and upload it to HDFS:

hdfs dfs -mkdir -p /hbase/input
cd /export/servers/
vim user.txt
0007    zhangsan        18
0008    lisi    25
0009    wangwu  20
hdfs dfs -put user.txt /hbase/input

HDFSReadWrite:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HDFSReadWrite extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {

        Job job = Job.getInstance(super.getConf(), "hdfsReadWrite");

        // Read the text file(s) from HDFS
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/hbase/input"));

        // Custom mapper: forwards each line as the key, NullWritable as the value
        job.setMapperClass(HDFSReadWriteMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Shuffle (sort/partition/combine) is left at the defaults

        // Reducer: parses each line and writes a Put into the myuser2 table
        TableMapReduceUtil.initTableReducerJob("myuser2", HBaseWrite.class, job);

        job.setNumReduceTasks(1);

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");

        int run = ToolRunner.run(configuration, new HDFSReadWrite(), args);
        System.exit(run);
    }
}

HDFSReadWriteMapper:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/***
 *  Reads data from HDFS.
 *  LongWritable:  k1, the byte offset of the line
 *  Text:          v1, the line content
 *
 *  Text:          k2, the line content passed through as the key
 *  NullWritable:  v2, a placeholder value
 */



public class HDFSReadWriteMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Pass the whole line through as the key; the reducer does the parsing
        context.write(value, NullWritable.get());
    }
}

HBaseWrite:

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

public class HBaseWrite extends TableReducer<Text, NullWritable, ImmutableBytesWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {

        // Split the line, wrap the fields in a Put, and write it out
        String[] split = key.toString().split("\t");

        Put put = new Put(split[0].getBytes());
        put.addColumn("f1".getBytes(), "name".getBytes(), split[1].getBytes());
        put.addColumn("f1".getBytes(), "age".getBytes(), split[2].getBytes());

        // k3 is an ImmutableBytesWritable wrapping the rowkey
        ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
        immutableBytesWritable.set(split[0].getBytes());

        context.write(immutableBytesWritable, put);
    }
}

Requirement 3: bulk-load data into HBase with bulkload

There are many ways to load data into HBase: we can write data with the HBase Java API or import it with Sqoop, but those approaches are either slow or consume Region resources during the import, which hurts efficiency. Alternatively, a MapReduce job can convert the data directly into HFile, HBase's final storage format, and the resulting HFiles can simply be loaded into HBase.

On HDFS, each HBase table is stored in a folder under the HBase root directory, named after the table; inside each table folder every Region has its own folder, inside each Region folder every column family has its own folder, and those column-family folders hold the HFiles. HFile is the format in which HBase data is stored on HDFS, so an HBase table ultimately appears on HDFS as a set of HFiles. If we convert our data into HFiles ourselves, HBase can load and read those files directly.
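
As a point of reference (an illustration only; the exact location depends on the hbase.rootdir setting and the HBase version), on an HBase 1.x cluster the HFiles of the myuser2 table typically live under a path of the form:

/hbase/data/default/myuser2/<encoded-region-name>/f1/<hfile>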
Advantages:

  1. The import does not consume Region resources

  2. Massive amounts of data can be imported quickly

  3. It saves memory

BulkLoadMain:


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class BulkLoadMain extends Configured implements Tool {

    @Override
    public int run(String[] strings) throws Exception {

        Configuration conf = super.getConf();
        Job job = Job.getInstance(conf, "bulkLoad");

        // Read the text data; here we reuse the user.txt uploaded to /hbase/input earlier
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("hdfs://node01:8020/hbase/input"));

        job.setMapperClass(BulkLoadMapper.class);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);

        Connection connection = ConnectionFactory.createConnection(conf);
        Table table = connection.getTable(TableName.valueOf("myuser2"));

        // Configure the job for incremental (bulk) load against the target table
        HFileOutputFormat2.configureIncrementalLoad(job, table, connection.getRegionLocator(TableName.valueOf("myuser2")));

        // Use HFileOutputFormat2 so the output is written as HFiles
        job.setOutputFormatClass(HFileOutputFormat2.class);
        HFileOutputFormat2.setOutputPath(job, new Path("hdfs://node01:8020/hbase/output_hfile"));

        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {

        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum", "node01:2181,node02:2181,node03:2181");

        int run = ToolRunner.run(configuration, new BulkLoadMain(), args);
        System.exit(run);
    }
}

BulkLoadMapper

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;


// K2 is ImmutableBytesWritable (the rowkey), V2 is Put
public class BulkLoadMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

        // Lines look like: 0007<TAB>zhangsan<TAB>18
        String[] split = value.toString().split("\t");

        ImmutableBytesWritable immutableBytesWritable = new ImmutableBytesWritable();
        immutableBytesWritable.set(split[0].getBytes());

        Put put = new Put(split[0].getBytes());
        put.addColumn("f1".getBytes(), "name".getBytes(), split[1].getBytes());
        put.addColumn("f1".getBytes(), "age".getBytes(), split[2].getBytes());

        context.write(immutableBytesWritable, put);
    }
}

LoadData:


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class LoadData {

    public static void main(String[] args) throws Exception {

        Configuration configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.property.clientPort", "2181");
        configuration.set("hbase.zookeeper.quorum", "node01,node02,node03");

        Connection connection = ConnectionFactory.createConnection(configuration);
        Admin admin = connection.getAdmin();
        Table table = connection.getTable(TableName.valueOf("myuser2"));

        // Move the HFiles produced by the bulkLoad job into the myuser2 table
        LoadIncrementalHFiles load = new LoadIncrementalHFiles(configuration);
        load.doBulkLoad(new Path("hdfs://node01:8020/hbase/output_hfile"), admin, table, connection.getRegionLocator(TableName.valueOf("myuser2")));

        connection.close();
    }
}
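
As an alternative to the Java client above, the same HFiles can usually be loaded with the LoadIncrementalHFiles tool that ships with HBase (assuming the hbase script is on the PATH):

hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles hdfs://node01:8020/hbase/output_hfile myuser2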

HBase compared with Hive

Hive:

  A data warehouse.
  In essence, Hive keeps a mapping (stored in MySQL) between table definitions and the files already stored on HDFS, so that HQL can be used to manage and query them.
  Used for data analysis and cleansing.
  Hive is suited to offline analysis and cleansing, with relatively high latency.
  Built on HDFS and MapReduce.
  The data Hive stores still lives on DataNodes, and the HQL you write is ultimately translated into MapReduce jobs.

HBase:

  A database.
  A column-oriented, non-relational (NoSQL) database.
  Used to store structured and unstructured data.
  Suited to storing single-table, non-relational data; not suited to relational operations such as JOIN.
  Built on HDFS.
  Data is persisted as HFiles on DataNodes and managed by RegionServers in the form of regions.
  Low latency, suitable for online business access.
  Faced with large volumes of enterprise data, HBase can store massive amounts of data in a single table while providing fast data access.

Summary: Hive and HBase
Hive and HBase are two different technologies built on Hadoop: Hive is a SQL-like engine that runs MapReduce jobs, while HBase is a NoSQL key/value database on top of Hadoop. The two can be used together, just as you might use Google for search and Facebook for social networking: Hive for statistical queries, HBase for real-time queries, with data written from Hive into HBase or read back from HBase into Hive.

Hive and HBase integration

We can store the data directly in HBase and then, by integrating Hive with HBase, analyze the HBase data with SQL statements, which is very convenient.

Requirement 1: save the results of a Hive analysis into HBase

Step 1: copy five HBase jar dependencies into Hive's lib directory

Copy (or symlink) the following five HBase jars into Hive's lib directory.

The HBase jars are all under /export/servers/hbase-1.2.0-cdh5.14.0/lib; the five we need are:
hbase-client-1.2.0-cdh5.14.0.jar         	  
hbase-hadoop2-compat-1.2.0-cdh5.14.0.jar 
hbase-hadoop-compat-1.2.0-cdh5.14.0.jar  
hbase-it-1.2.0-cdh5.14.0.jar    
hbase-server-1.2.0-cdh5.14.0.jar

Run the following commands on node03 to create symlinks for the jar dependencies:

ln -s /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-client-1.2.0-cdh5.14.0.jar              /export/servers/hive-1.1.0-cdh5.14.0/lib/hbase-client-1.2.0-cdh5.14.0.jar             
ln -s /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-hadoop2-compat-1.2.0-cdh5.14.0.jar      /export/servers/hive-1.1.0-cdh5.14.0/lib/hbase-hadoop2-compat-1.2.0-cdh5.14.0.jar             
ln -s /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-hadoop-compat-1.2.0-cdh5.14.0.jar       /export/servers/hive-1.1.0-cdh5.14.0/lib/hbase-hadoop-compat-1.2.0-cdh5.14.0.jar            
ln -s /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-it-1.2.0-cdh5.14.0.jar     /export/servers/hive-1.1.0-cdh5.14.0/lib/hbase-it-1.2.0-cdh5.14.0.jar               
ln -s /export/servers/hbase-1.2.0-cdh5.14.0/lib/hbase-server-1.2.0-cdh5.14.0.jar          /export/servers/hive-1.1.0-cdh5.14.0/lib/hbase-server-1.2.0-cdh5.14.0.jar           

Step 2: modify Hive's configuration file
Edit hive-site.xml on node03 and add the following two properties:

cd /export/servers/hive-1.1.0-cdh5.14.0/conf
vim hive-site.xml
<property>
    <name>hive.zookeeper.quorum</name>
    <value>node01,node02,node03</value>
</property>

<property>
    <name>hbase.zookeeper.quorum</name>
    <value>node01,node02,node03</value>
</property>

Step 3: add the following settings to the hive-env.sh configuration file

cd /export/servers/hive-1.1.0-cdh5.14.0/conf
vim hive-env.sh

export HADOOP_HOME=/export/servers/hadoop-2.6.0-cdh5.14.0
export HBASE_HOME=/export/servers/hbase-1.2.0-cdh5.14.0
export HIVE_CONF_DIR=/export/servers/hive-1.1.0-cdh5.14.0/conf

Step 4: create a Hive table and load the following data
Create the table in Hive.
Enter the Hive client:

cd /export/servers/hive-1.1.0-cdh5.14.0/
bin/hive

Create the Hive database and the corresponding table:

create database course;
use course;
create external table if not exists course.score(id int,cname string,score int) row format delimited fields terminated by '\t' stored as textfile ;

Prepare the data file; the data to load looks like this:

cd /export/
vim hive-hbase.txt

1       zhangsan        88
2       lisi    60
3       wangwu  30
4       zhaoliu 70

Load the data from the Hive client and verify it:

load data local inpath '/export/hive-hbase.txt' into table score;
select * from score;

Step 5: create a Hive-managed table mapped to HBase
We can create a Hive-managed table that maps onto a table in HBase; any data in the managed table is stored in HBase.
Create the managed (internal) table in Hive:

create table course.hbase_score(id int,cname string,score int)
stored by 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
with serdeproperties("hbase.columns.mapping" = ":key,cf:name,cf:score")
tblproperties("hbase.table.name" = "hbase_score");

Insert the data with insert overwrite ... select:

insert overwrite table course.hbase_score select id,cname,score from course.score;

Step 6: inspect the hbase_score table in HBase
From the HBase shell, list the tables and scan hbase_score:

hbase(main):023:0> list
TABLE                                                                                       
hbase_score                                                                                 
myuser                                                                                      
myuser2                                                                                     
student                                                                                     
user                                                                                        
5 row(s) in 0.0210 seconds

=> ["hbase_score", "myuser", "myuser2", "student", "user"]
hbase(main):024:0> scan 'hbase_score'
ROW                      COLUMN+CELL                                                        
 1                       column=cf:name, timestamp=1550628395266, value=zhangsan            
 1                       column=cf:score, timestamp=1550628395266, value=80                 
 2                       column=cf:name, timestamp=1550628395266, value=lisi                
 2                       column=cf:score, timestamp=1550628395266, value=60                 
 3                       column=cf:name, timestamp=1550628395266, value=wangwu              
 3                       column=cf:score, timestamp=1550628395266, value=30                 
 4                       column=cf:name, timestamp=1550628395266, value=zhaoliu             
 4                       column=cf:score, timestamp=1550628395266, value=70                 
4 row(s) in 0.0360 seconds
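
Because hbase_score is a Hive table backed by HBase through the storage handler, the same rows can also be queried back from the Hive client, for example:

select * from course.hbase_score;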

Requirement 2: create a Hive external table mapped to an existing HBase table, and query the data directly through Hive
Step 1: create a table in HBase and insert some data by hand
From the HBase shell, create a table and put a few rows into it:

create 'hbase_hive_score',{NAME => 'cf'}

put 'hbase_hive_score','1','cf:name','zhangsan'
put 'hbase_hive_score','1','cf:score', '95'

put 'hbase_hive_score','2','cf:name','lisi'
put 'hbase_hive_score','2','cf:score', '96'

put 'hbase_hive_score','3','cf:name','wangwu'
put 'hbase_hive_score','3','cf:score', '97'

Step 2: create a Hive external table mapped to the HBase table and its columns
From the Hive client, run the following statement; the resulting external table maps directly onto the data already in the HBase table:

CREATE external TABLE course.hbase2hive(id int, name string, score int) STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,cf:name,cf:score") TBLPROPERTIES("hbase.table.name" ="hbase_hive_score");
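
Once the external table exists, the rows inserted through the HBase shell can be queried directly from Hive, for example:

select * from course.hbase2hive;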
