When analysing data stored in HBase with MapReduce, the analysed results usually end up in a relational database such as MySQL.
The requirement here was to compute, in a single MapReduce job, each subscriber's total call duration per year, per month and per day, together with the number of calls. The hard part was writing the results to three different tables: when writing to a single table I had always used DBOutputFormat, but I could find no way to make it write to several tables at once. So I implemented a custom OutputFormat that routes each record to the appropriate MySQL table, and tested it successfully. The pieces are listed below.
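For reference, the three target tables (call_data_y, call_data_m, call_data_d) only need four columns, in the same order as the parameters bound in the RecordWriter below: phone number, date string, total call duration and call count. The column names and types in this sketch are my own assumption, not taken from the original schema:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateCallTables {
    public static void main(String[] args) throws Exception {
        // same connection settings as MSRecordWrite below
        Class.forName("com.mysql.jdbc.Driver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/hbase2db", "root", "root");
             Statement stmt = conn.createStatement()) {
            // one table per granularity; only the column order matters for the INSERTs below
            for (String table : new String[]{"call_data_y", "call_data_m", "call_data_d"}) {
                stmt.executeUpdate("create table if not exists " + table + " ("
                        + "call_phone varchar(20), "
                        + "call_date  varchar(10), "
                        + "call_time  bigint, "
                        + "counts     int)");
            }
        }
    }
}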
1. Extend RecordWriter
package com.HbaseMR;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
import java.sql.*;
// The generic parameters here are the reducer's output key / value types
public class MSRecordWrite extends RecordWriter<CallDate, Text> {

    private static Connection conn;

    static {
        String driver = "com.mysql.jdbc.Driver";
        String url = "jdbc:mysql://localhost:3306/hbase2db";
        String user = "root";
        String password = "root";
        try {
            Class.forName(driver);
            conn = DriverManager.getConnection(url, user, password);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    @Override
    public void write(CallDate key, Text value) throws IOException, InterruptedException {
        System.out.println("start to write " + value.toString());
        try {
            PreparedStatement pstmt;
            final String callPhone = key.getCallPhone();
            final String callDate = key.getCallDate();
            final String[] split = value.toString().split("\t");
            long callTime = Long.parseLong(split[0]);
            int counts = Integer.parseInt(split[1]);
            if (callDate.length() == 4) {            // yearly record (yyyy)
                pstmt = conn.prepareStatement("insert into call_data_y values(?,?,?,?)");
            } else if (callDate.length() == 7) {     // monthly record (yyyy-MM)
                pstmt = conn.prepareStatement("insert into call_data_m values(?,?,?,?)");
            } else {                                 // daily record (yyyy-MM-dd)
                pstmt = conn.prepareStatement("insert into call_data_d values(?,?,?,?)");
            }
            // bind the parameters
            pstmt.setString(1, callPhone);
            pstmt.setString(2, callDate);
            pstmt.setLong(3, callTime);
            pstmt.setInt(4, counts);
            pstmt.executeUpdate();
            pstmt.close();
            System.out.println("insert succeeded: " + value.toString());
        } catch (Exception ex) {
            // do not close the shared connection here, otherwise every later write would fail
            ex.printStackTrace();
        }
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        // release the shared connection when the task finishes
        try {
            if (conn != null) {
                conn.close();
            }
        } catch (SQLException e) {
            throw new IOException(e);
        }
    }
}
2. Extend OutputFormat
package com.HbaseMR;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import java.io.IOException;
public class MSOutPutFormat extends OutputFormat<CallDate, Text> {

    @Override
    public RecordWriter<CallDate, Text> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new MSRecordWrite();
    }

    @Override
    public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        // nothing to validate: the output goes to MySQL, not to HDFS
    }

    @Override
    public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
        // reuse NullOutputFormat's committer, since there is no HDFS output to commit
        return (new org.apache.hadoop.mapreduce.lib.output.NullOutputFormat<NullWritable, IntWritable>())
                .getOutputCommitter(context);
    }
}
3. The custom key class
package com.HbaseMR;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class CallDate implements WritableComparable<CallDate> {

    // phone number
    private String callPhone;
    // date string (yyyy, yyyy-MM or yyyy-MM-dd)
    private String callDate;

    public CallDate() {}

    public CallDate(String callPhone, String callDate) {
        this.callDate = callDate;
        this.callPhone = callPhone;
    }

    @Override
    public int compareTo(CallDate o) {
        // order by phone number first, then by date
        return callPhone.compareTo(o.getCallPhone()) == 0
                ? callDate.compareTo(o.getCallDate())
                : callPhone.compareTo(o.getCallPhone());
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(callPhone);
        out.writeUTF(callDate);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.callPhone = in.readUTF();
        this.callDate = in.readUTF();
    }

    public String getCallDate() {
        return callDate;
    }

    public String getCallPhone() {
        return callPhone;
    }

    public void setCallDate(String callDate) {
        this.callDate = callDate;
    }

    public void setCallPhone(String callPhone) {
        this.callPhone = callPhone;
    }

    @Override
    public String toString() {
        return this.callPhone + "\t" + this.callDate;
    }
}
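The job below installs its own Partitioner, so CallDate gets by without hashCode() and equals(); if you ever fall back to the default HashPartitioner, though, it will need them. A minimal sketch consistent with compareTo() that could be added to the class above:

@Override
public boolean equals(Object o) {
    if (this == o) return true;
    if (!(o instanceof CallDate)) return false;
    CallDate other = (CallDate) o;
    return callPhone.equals(other.callPhone) && callDate.equals(other.callDate);
}

@Override
public int hashCode() {
    return 31 * callPhone.hashCode() + callDate.hashCode();
}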
4. The Mapper
package com.HbaseMR;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
import java.util.List;
// The generic parameters here are the mapper's output key / value types
public class HbaseMapper extends TableMapper<CallDate, Text> {

    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        // each call reads one RowKey; one row records one call made by one subscriber
        // columns: calling number, calling name, called number, called name, call date, call duration
        String calling_phone = null;
        String call_data = null;
        String call_time = null;
        String call_phone = null;
        // fetch every cell of the row
        final Cell[] cells = value.rawCells();
        for (Cell cell : cells) {
            // qualifier (column name) of the cell
            String c = new String(CellUtil.cloneQualifier(cell));
            // value of the cell
            String v = new String(CellUtil.cloneValue(cell));
            // pick out the fields we need
            switch (c) {
                case "called_phone":
                    call_phone = v;      // called number (not used in the aggregation below)
                    break;
                case "dialing_phone":
                    calling_phone = v;   // calling number
                    break;
                case "call_date":
                    call_data = v;
                    break;
                case "call_time":
                    call_time = v;
                    break;
            }
        }
        // the date format is 2018-06-26 15:36:03
        // wrap phone number + truncated date into the custom bean; bean and call duration go to the reducer
        // year-month (yyyy-MM), for the monthly total
        context.write(new CallDate(calling_phone, call_data.substring(0, 7)), new Text(call_time));
        // year only (yyyy), for the yearly total
        context.write(new CallDate(calling_phone, call_data.substring(0, 4)), new Text(call_time));
        // full day (yyyy-MM-dd), for the daily total
        context.write(new CallDate(calling_phone, call_data.substring(0, 10)), new Text(call_time));
    }
}
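With the date format noted above, the three context.write() calls emit one record per granularity; the resulting date suffixes have lengths 4, 7 and 10, which is exactly what the partitioner and MSRecordWrite branch on:

// sample value, format taken from the comment in the mapper
String callData = "2018-06-26 15:36:03";
String year  = callData.substring(0, 4);    // "2018"       -> length 4,  yearly total   -> call_data_y
String month = callData.substring(0, 7);    // "2018-06"    -> length 7,  monthly total  -> call_data_m
String day   = callData.substring(0, 10);   // "2018-06-26" -> length 10, daily total    -> call_data_d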
5. The Partitioner: yearly, monthly and daily keys each go to their own partition
package com.HbaseMR;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class HbasePartitioner extends Partitioner<CallDate, Text> {

    @Override
    public int getPartition(CallDate callDate, Text text, int i) {
        if (callDate.getCallDate().length() == 10) {
            return 0;    // daily keys
        } else if (callDate.getCallDate().length() == 7) {
            return 1;    // monthly keys
        } else {
            return 2;    // yearly keys
        }
    }
}
6. The GroupingComparator: records with the same phone number and the same date are treated as one key for aggregation
package com.HbaseMR;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class HbaseGroupingCompar extends WritableComparator {

    public HbaseGroupingCompar() {
        super(CallDate.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CallDate a1 = (CallDate) a;
        CallDate b1 = (CallDate) b;
        if (a1.getCallPhone().equals(b1.getCallPhone()) && a1.getCallDate().equals(b1.getCallDate())) {
            return 0;
        } else if (!a1.getCallPhone().equals(b1.getCallPhone())) {
            return a1.getCallPhone().compareTo(b1.getCallPhone());
        } else {
            return a1.getCallDate().compareTo(b1.getCallDate());
        }
    }
}
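Since CallDate.compareTo() already orders by phone number first and date second, the compare() body above is equivalent to simply delegating to it; a more compact variant would be:

@Override
public int compare(WritableComparable a, WritableComparable b) {
    return ((CallDate) a).compareTo((CallDate) b);
}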
7. The Reducer: aggregate the values that share the same phone number and date
package com.HbaseMR;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class HbaseReducer extends Reducer<CallDate, Text, CallDate, Text> {

    @Override
    protected void reduce(CallDate key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        long callTime = 0;
        int i = 0;
        // i counts the calls: the number of calls equals the number of values
        for (Text v : values) {
            callTime += Long.parseLong(v.toString());
            i++;
        }
        // output value is "totalDuration \t callCount", which MSRecordWrite later splits on "\t"
        context.write(key, new Text(callTime + "\t" + i));
    }
}
8. The Driver
package com.HbaseMR;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HbaseDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        // ZooKeeper quorum: with the zk address set, the job can read the HBase table
        conf.set("hbase.zookeeper.quorum", "192.168.136.150:2181,192.168.136.151:2181,192.168.136.152:2181");
        int status = ToolRunner.run(conf, new HbaseDriver(), args);
        // note: run() below uses 1 as its success flag (the opposite of the usual exit-code convention)
        if (status == 1) {
            System.out.println("success");
        } else {
            System.out.println("failed");
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        final Configuration conf = this.getConf();
        final Job job = Job.getInstance(conf);
        job.setJarByClass(HbaseDriver.class);
        job.setGroupingComparatorClass(HbaseGroupingCompar.class);

        Scan scan = new Scan();
        scan.setCacheBlocks(false);
        scan.setCaching(500);
        TableMapReduceUtil.initTableMapperJob("kafka:call_logs1", scan,
                HbaseMapper.class, CallDate.class, Text.class, job);

        job.setReducerClass(HbaseReducer.class);
        job.setOutputKeyClass(CallDate.class);
        job.setOutputValueClass(Text.class);
        job.setPartitionerClass(HbasePartitioner.class);
        // three reducers, matching the three partitions (day / month / year)
        job.setNumReduceTasks(3);
        // route the reduce output through the custom OutputFormat into MySQL
        job.setOutputFormatClass(MSOutPutFormat.class);

        final boolean status = job.waitForCompletion(true);
        return status ? 1 : 0;
    }
}
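One practical point: MSRecordWrite loads com.mysql.jdbc.Driver in its static block on every reduce task, so the MySQL connector jar has to be on the task classpath as well, for example by bundling it into the job jar or shipping it with the standard -libjars option, which ToolRunner/GenericOptionsParser already parses for this driver.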
As a bonus, here is an HBase pre-split example (i.e., how the table's regions are laid out):
public static void createTable1(String tableName, String... familyColumn) {
    final HTableDescriptor tableDescriptor = new HTableDescriptor(TableName.valueOf(tableName));
    // 7 split keys are passed in, which yields 8 regions (the last one, from "06|" upward, stays essentially empty)
    // '|' (ASCII 124) sorts after the digits, letters and '_', so it acts as an upper bound for each prefix:
    // rowkeys starting with 00 fall into one region,
    // rowkeys starting with 01 into the next,
    // .....
    // rowkeys starting with 06 into the last populated region
    byte[][] splitKey = {
            Bytes.toBytes("00|"),
            Bytes.toBytes("01|"),
            Bytes.toBytes("02|"),
            Bytes.toBytes("03|"),
            Bytes.toBytes("04|"),
            Bytes.toBytes("05|"),
            Bytes.toBytes("06|"),
    };
    for (String column : familyColumn) {
        final HColumnDescriptor columnDescriptor = new HColumnDescriptor(column);
        // Names.TABLENAME_RLS_ is a project constant; that table keeps up to 100 versions per cell
        if (tableName.equals(Names.TABLENAME_RLS_)) {
            columnDescriptor.setMinVersions(100);
            columnDescriptor.setMaxVersions(100);
        }
        tableDescriptor.addFamily(columnDescriptor);
    }
    try {
        // pass the split keys in when the table is created (admin is an Admin instance obtained elsewhere)
        admin.createTable(tableDescriptor, splitKey);
        System.out.println("table created");
    } catch (IOException e) {
        System.out.println("failed to create table");
        e.printStackTrace();
    }
}
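The split keys only pay off if the rowkeys actually start with a two-digit prefix in the 00-06 range. The rowkey layout is not shown in this post, so the following is only an assumed sketch of one common approach: hash the phone number (plus the month) into a 00-06 salt and put it in front. A key such as "03_13812345678_2018-06" then sorts between "02|" and "03|" (because '_' is smaller than '|'), so all rows with the same prefix land in the same region.

// assumed sketch only: the real rowkey layout is not part of the original code
public static String buildRowKey(String phone, String yearMonth) {
    // hash phone + month into one of the 7 buckets so a hot number still spreads over time
    int bucket = ((phone + yearMonth).hashCode() & Integer.MAX_VALUE) % 7;   // 0..6
    String prefix = String.format("%02d", bucket);                           // "00".."06"
    // '_' (ASCII 95) < '|' (ASCII 124), so "03_..." stays below the "03|" split key
    return prefix + "_" + phone + "_" + yearMonth;
}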