Apache HBase: Exporting HBase Data to HDFS with MapReduce

Data preparation:

Table: mktest:mk3 (scan output, truncated)

.... 
95021                                  column=user:age, timestamp=1554208964508, value=17                                                                
95021                                  column=user:dept, timestamp=1554208964508, value=MA                                                               
95021                                  column=user:name, timestamp=1554208964508, value=\xE5\x91\xA8\xE4\xBA\x8C                                         
95021                                  column=user:sex, timestamp=1554208964508, value=\xE7\x94\xB7                                                      
95022                                  column=user:age, timestamp=1554208964508, value=20                                                                
95022                                  column=user:dept, timestamp=1554208964508, value=MA                                                               
95022                                  column=user:name, timestamp=1554208964508, value=\xE9\x83\x91\xE6\x98\x8E                                         
95022                                  column=user:sex, timestamp=1554208964508, value=\xE7\x94\xB7
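
For reference, here is a minimal sketch of how a row such as 95022 could be written with the HBase Java client. This is an assumption about how the sample data might have been loaded, not part of the original post; the ZooKeeper quorum is borrowed from the Driver class below, and the name/sex strings are the decoded UTF-8 of the escaped bytes shown above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class LoadSampleRow {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumed ZooKeeper quorum, taken from the Driver class below.
        conf.set("hbase.zookeeper.quorum", "mycat01:2181,mycat02:2181,mycat03:2181");
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(TableName.valueOf("mktest:mk3"))) {
            Put put = new Put(Bytes.toBytes("95022"));
            put.addColumn(Bytes.toBytes("user"), Bytes.toBytes("age"),  Bytes.toBytes("20"));
            put.addColumn(Bytes.toBytes("user"), Bytes.toBytes("dept"), Bytes.toBytes("MA"));
            put.addColumn(Bytes.toBytes("user"), Bytes.toBytes("name"), Bytes.toBytes("郑明"));
            put.addColumn(Bytes.toBytes("user"), Bytes.toBytes("sex"),  Bytes.toBytes("男"));
            table.put(put);
        }
    }
}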

Requirement: group by dept and compute the average age of each department. (Taking just the two rows shown above, both in MA, the average would be (17 + 20) / 2 = 18.5; the full table, elided above, yields the averages in the results at the end.)

1. MapReduce program design

1) Map side

Define a custom mapper class that extends TableMapper (provided by HBase).

package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

public class HBase2HdfsMapper extends TableMapper<Text, IntWritable> { // output key: department, output value: age

    private Text mk = new Text();               // reusable output key: department
    private IntWritable mv = new IntWritable(); // reusable output value: age

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        // Walk every cell of the current row and pick out the dept and age columns.
        Cell[] cells = value.rawCells();
        for (Cell c : cells) {
            String name = Bytes.toString(c.getQualifierArray(), c.getQualifierOffset(), c.getQualifierLength());
            String val = Bytes.toString(c.getValueArray(), c.getValueOffset(), c.getValueLength());
            if (name.equals("dept")) {
                mk.set(val);                    // department -> map output key
            }
            if (name.equals("age")) {
                mv.set(Integer.parseInt(val));  // age -> map output value
            }
        }
        context.write(mk, mv);
    }
}
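
One detail worth noting: mk and mv are reused across map() calls, so a row missing user:dept or user:age would silently carry over the previous row's value. A defensive variant (a sketch, not part of the original code; a drop-in replacement for the map() method above, using the same fields and imports) could emit only when both columns are present, reading them via Result.getValue:

    // Hypothetical guarded map(): only write when both dept and age exist in the row.
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context)
            throws IOException, InterruptedException {
        byte[] cf = Bytes.toBytes("user");
        byte[] dept = value.getValue(cf, Bytes.toBytes("dept"));
        byte[] age = value.getValue(cf, Bytes.toBytes("age"));
        if (dept != null && age != null) {
            mk.set(Bytes.toString(dept));
            mv.set(Integer.parseInt(Bytes.toString(age)));
            context.write(mk, mv);
        }
    }
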
2) Reduce side

Define a custom reducer class that extends Hadoop's Reducer class. (TableMapReduceUtil, the helper API HBase provides for wiring HBase tables into a MapReduce job as input or output, is used in the Driver below.)

package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class HBase2HdfsReducer extends Reducer<Text, IntWritable, Text, NullWritable> { // emit one tab-separated line per department
    private Text mk=new Text();
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;   // sum of ages within this dept group
        int count = 0; // number of records in this dept group
        for (IntWritable value : values) {
            sum += value.get();
            count++;
        }
        double avg = sum * 1.0 / count;
        mk.set(key.toString() + "\t" + Double.toString(avg)); // output line: "dept<TAB>average age"
        context.write(mk,NullWritable.get());
    }
}
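
An equivalent formulation (a sketch, not what the original code does) keeps the department as the output key and emits the average as a DoubleWritable, so TextOutputFormat writes the tab separator itself. This would change the class declaration to Reducer<Text, IntWritable, Text, DoubleWritable>, add an import of org.apache.hadoop.io.DoubleWritable, and the Driver would set job.setOutputValueClass(DoubleWritable.class):

    // Hypothetical alternative reduce(): emit (dept, average) instead of a preformatted line.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        int count = 0;
        for (IntWritable value : values) {
            sum += value.get();
            count++;
        }
        context.write(key, new DoubleWritable(sum * 1.0 / count));
    }
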
3) Driver class

package com.mycat.hdemo.hbase2hdfs;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class HBase2HdfsDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("HADOOP_USER_NAME", "hadoop");  // submit as the hadoop user
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://mkmg/");           // HDFS nameservice used for the output
        conf.set("hbase.zookeeper.quorum", "mycat01:2181,mycat02:2181,mycat03:2181"); // HBase ZooKeeper ensemble
        Job job = Job.getInstance(conf);

        job.setJarByClass(HBase2HdfsDriver.class);

        job.setReducerClass(HBase2HdfsReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Use the HBase table as the job input: table name, Scan, mapper class,
        // and the mapper's output key/value types; the final false skips adding dependency jars.
        Scan scan = new Scan();
        TableMapReduceUtil.initTableMapperJob("mktest:mk3", scan, HBase2HdfsMapper.class, Text.class, IntWritable.class, job, false);

        // Delete any previous output directory so the job does not fail on an existing path.
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path("/mktest/mk4out");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);

        job.waitForCompletion(true);
    }
}
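
One possible refinement (not in the original code): the Scan passed to initTableMapperJob can be narrowed to the two columns the mapper actually reads and tuned for a full-table MapReduce scan. This sketch would replace the bare Scan construction in the Driver and additionally requires importing org.apache.hadoop.hbase.util.Bytes there:

        // Hypothetical Scan tuning: fetch only user:dept and user:age,
        // batch more rows per RPC, and keep scanned blocks out of the block cache.
        Scan scan = new Scan();
        scan.addColumn(Bytes.toBytes("user"), Bytes.toBytes("dept"));
        scan.addColumn(Bytes.toBytes("user"), Bytes.toBytes("age"));
        scan.setCaching(500);
        scan.setCacheBlocks(false);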

2. Results

[hadoop@mycat01 stu]$ hdfs dfs -cat /mktest/mk4out/part-r-00000
CS	20.0
IS	19.166666666666668
MA	19.0
