Table of Contents
I. Introduction
  Preparation
II. HDFS to HBase with MapReduce (Operation 1)
  1. Map phase
  2. Reduce phase
  3. Driver phase
  4. Checking the result
III. HBase to HBase with MapReduce (Operation 2)
  1. Map phase
  2. Reduce phase
  3. Driver phase
  4. Checking the result
I. Introduction

This article walks through two simple examples that combine HBase with MapReduce: first, reading data from HDFS and importing it into an HBase table; second, reading selected data from that table and importing it into another table.
Preparation

Create the student score file:
vi student.tsv
Add the following rows (the fields are tab-separated, matching the split("\t") in the Mapper below):
1 1001 刘一 99
2 1002 陈二 98
3 1003 张三 97
4 1004 李四 96
Upload the file to the target path on HDFS:
hadoop fs -put student.tsv /Data/Hbase/textHbase/HbaseMapreduce/input/
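As a side note, the upload can also be done with the HDFS Java API. The sketch below is only illustrative: the class name UploadStudentFile is made up, and it assumes the same NameNode address (hdfs://hadoop01:9000) and root user that the drivers later in this article use.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadStudentFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumption: same NameNode address and user as the drivers in this article
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        System.setProperty("HADOOP_USER_NAME", "root");
        FileSystem fs = FileSystem.get(conf);
        // copyFromLocalFile does the same job as `hadoop fs -put`
        fs.copyFromLocalFile(new Path("student.tsv"),
                new Path("/Data/Hbase/textHbase/HbaseMapreduce/input/"));
        fs.close();
    }
}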
Create two tables in HBase to receive the data in the later steps:
hbase(main):013:0> create 'student','info'
Created table student
Took 4.2924 seconds
=> Hbase::Table - student
hbase(main):014:0> create 'student_new','info'
Created table student_new
Took 1.2611 seconds
=> Hbase::Table - student_new
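The tables can also be created programmatically. The following rough sketch uses the HBase Admin API; it assumes an HBase 2.x client and the same ZooKeeper quorum configured in the drivers below, and the class name CreateStudentTables is made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;

public class CreateStudentTables {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumption: same ZooKeeper quorum as the drivers in this article
        conf.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            // Create student and student_new, each with a single info column family
            for (String name : new String[]{"student", "student_new"}) {
                TableName tableName = TableName.valueOf(name);
                if (!admin.tableExists(tableName)) {
                    admin.createTable(TableDescriptorBuilder.newBuilder(tableName)
                            .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
                            .build());
                }
            }
        }
    }
}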
II. HDFS to HBase with MapReduce (Operation 1)

Read the file from HDFS and import it into the student table. The Mapper, Reducer, and Driver classes are listed below.

1. Map phase
package com.itcast.hbase.example.Hbase_MapReduce1;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class ReadDataMapper extends Mapper<LongWritable, Text, ImmutableBytesWritable, Put> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Receive one line of input text and convert it to a String
        String line = value.toString();
        // Split the line on the tab separator into fields
        String[] words = line.split("\t");
        // Build a Put object keyed by the first field (the row key)
        Put put = new Put(Bytes.toBytes(words[0]));
        // Add the remaining fields as columns of the info family
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("Number"), Bytes.toBytes(words[1]));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("Name"), Bytes.toBytes(words[2]));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("Source"), Bytes.toBytes(words[3]));
        // Emit the row key and Put so the reduce phase can write them to HBase
        context.write(new ImmutableBytesWritable(Bytes.toBytes(words[0])), put);
    }
}
2. Reduce phase

package com.itcast.hbase.example.Hbase_MapReduce1;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;

import java.io.IOException;

public class ReadDataReduce extends TableReducer<ImmutableBytesWritable, Put, NullWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
        // Iterate over the Puts for this row key and write each one to the HBase table
        for (Put value : values) {
            context.write(NullWritable.get(), value);
        }
    }
}
3. Driver phase

package com.itcast.hbase.example.Hbase_MapReduce1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HRegionPartitioner;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

public class ReadDataDriver {
    static FileSystem fs = null;

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // The Job object carries all the configuration for this MapReduce run
        Configuration configuration = HBaseConfiguration.create();
        // Point the client at the HDFS file system (NameNode address)
        configuration.set("fs.defaultFS", "hdfs://hadoop01:9000");
        // Configure the ZooKeeper quorum used to reach HBase
        configuration.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
        // Access HDFS as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Obtain a file system client via the FileSystem factory method
        fs = FileSystem.get(configuration);
        // Create the Job instance
        Job job = Job.getInstance(configuration);
        // Set the main class of the job jar
        job.setJarByClass(ReadDataDriver.class);
        // Set the Mapper class for this job
        job.setMapperClass(ReadDataMapper.class);
        // Set the Mapper output key and value types
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        // Specify where the input data lives
        FileInputFormat.setInputPaths(job, new Path("/Data/Hbase/textHbase/HbaseMapreduce/input/"));
        // Set the number of reduce tasks
        job.setNumReduceTasks(10);
        // Configure the reducer to write into the student table, partitioned by region
        TableMapReduceUtil.initTableReducerJob("student", ReadDataReduce.class, job, HRegionPartitioner.class);
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
4. Checking the result

hbase(main):003:0> scan 'student',{FORMATTER=>'toString'}
ROW COLUMN+CELL
1 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=刘一
1 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1001
1 column=info:Source, timestamp=2023-04-15T19:35:19.445Z, value=99
2 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=陈二
2 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1002
2 column=info:Source, timestamp=2023-04-15T19:35:19.445Z, value=98
3 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=张三
3 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1003
3 column=info:Source, timestamp=2023-04-15T19:35:19.445Z, value=97
4 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=李四
4 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1004
4 column=info:Source, timestamp=2023-04-15T19:35:19.445Z, value=96
4 row(s)
Took 0.0119 seconds
III. HBase to HBase with MapReduce (Operation 2)

Read the data from the student table and import the selected columns (info:Name and info:Number) into the student_new table.

1. Map phase
package com.itcast.hbase.example.Hbase_MapReduce2;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

public class ReadysMapreduce extends TableMapper<ImmutableBytesWritable, Put> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        // Extract the Name and Number columns of the student row into a Put object
        Put put = new Put(key.get());
        // Walk over every cell in the row
        for (Cell cell : value.rawCells()) {
            // Keep only cells in the info column family
            if ("info".equals(Bytes.toString(CellUtil.cloneFamily(cell)))) {
                if ("Name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {
                    // Add the Name cell to the Put object
                    put.add(cell);
                } else if ("Number".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {
                    // Add the Number cell to the Put object
                    put.add(cell);
                }
            }
        }
        context.write(key, put);
    }
}
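To make the cell filtering above a bit more concrete, here is a small standalone sketch of what CellUtil.cloneFamily and CellUtil.cloneQualifier return for a single cell. It assumes an HBase 2.x client (CellBuilderFactory), and the class name CellUtilDemo is made up for illustration.

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellBuilderFactory;
import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.util.Bytes;

public class CellUtilDemo {
    public static void main(String[] args) {
        // Build a standalone cell shaped like row 1's info:Name column in the student table
        Cell cell = CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
                .setRow(Bytes.toBytes("1"))
                .setFamily(Bytes.toBytes("info"))
                .setQualifier(Bytes.toBytes("Name"))
                .setTimestamp(System.currentTimeMillis())
                .setType(Cell.Type.Put)
                .setValue(Bytes.toBytes("刘一"))
                .build();
        // cloneFamily/cloneQualifier copy the family and qualifier bytes out of the cell,
        // which is what the mapper compares against "info", "Name" and "Number"
        System.out.println(Bytes.toString(CellUtil.cloneFamily(cell)));    // info
        System.out.println(Bytes.toString(CellUtil.cloneQualifier(cell))); // Name
        System.out.println(Bytes.toString(CellUtil.cloneValue(cell)));     // 刘一
    }
}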
2. Reduce phase

package com.itcast.hbase.example.Hbase_MapReduce2;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;

import java.io.IOException;

public class WriteysReduce extends TableReducer<ImmutableBytesWritable, Put, NullWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context)
            throws IOException, InterruptedException {
        // Write every Put read for this row key into the student_new table
        for (Put put : values) {
            context.write(NullWritable.get(), put);
        }
    }
}
3. Driver phase

package com.itcast.hbase.example.Hbase_MapReduce2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;

public class JobysMapreduce {
    static FileSystem fs = null;

    public static void main(String[] args) throws Exception {
        // The Job object carries all the configuration for this MapReduce run
        Configuration configuration = HBaseConfiguration.create();
        // Point the client at the HDFS file system (NameNode address)
        configuration.set("fs.defaultFS", "hdfs://hadoop01:9000");
        // Configure the ZooKeeper quorum used to reach HBase
        configuration.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
        // Access HDFS as the root user
        System.setProperty("HADOOP_USER_NAME", "root");
        // Obtain a file system client via the FileSystem factory method
        fs = FileSystem.get(configuration);
        // Create the Job instance
        Job job = Job.getInstance(configuration);
        // Set the main class of the job jar
        job.setJarByClass(JobysMapreduce.class);
        // Configure the scan that feeds the mapper
        Scan scan = new Scan();
        scan.setCacheBlocks(false);
        scan.setCaching(500);
        TableMapReduceUtil.initTableMapperJob(
                "student",                     // source table name
                scan,                          // scan controlling which data is read
                ReadysMapreduce.class,         // Mapper class
                ImmutableBytesWritable.class,  // Mapper output key type
                Put.class,                     // Mapper output value type
                job                            // the job to configure
        );
        // Configure the reducer to write into the student_new table
        TableMapReduceUtil.initTableReducerJob("student_new", WriteysReduce.class, job);
        // Use at least one reduce task
        job.setNumReduceTasks(1);
        // Submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
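Besides the shell scan in the next step, the result can also be spot-checked from Java. The rough sketch below is only illustrative: the class name CheckStudentNew is made up, and it reuses the same ZooKeeper quorum as the drivers. It fetches row 1 from student_new and confirms that info:Source was not copied over.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class CheckStudentNew {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // Assumption: same ZooKeeper quorum as the drivers in this article
        conf.set("hbase.zookeeper.quorum", "hadoop01:2181,hadoop02:2181,hadoop03:2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Table table = connection.getTable(TableName.valueOf("student_new"))) {
            Result result = table.get(new Get(Bytes.toBytes("1")));
            // Only info:Name and info:Number should be present; info:Source was filtered out by the mapper
            System.out.println("Name:   " + Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("Name"))));
            System.out.println("Number: " + Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("Number"))));
            System.out.println("Has Source? " + result.containsColumn(Bytes.toBytes("info"), Bytes.toBytes("Source"))); // expected: false
        }
    }
}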
4. Checking the result

hbase(main):004:0> scan 'student_new',{FORMATTER=>'toString'}
ROW COLUMN+CELL
1 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=刘一
1 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1001
2 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=陈二
2 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1002
3 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=张三
3 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1003
4 column=info:Name, timestamp=2023-04-15T19:35:19.445Z, value=李四
4 column=info:Number, timestamp=2023-04-15T19:35:19.445Z, value=1004
4 row(s)
Took 0.7145 seconds