The requirement:
To unify our tech stack, the team agreed to use Flink for both batch and stream processing.
The problem:
Compared with Spark, Flink's batch side still feels rough around the edges; things Spark handles in a few lines take some extra thought in Flink.
For reference, here is the Spark version:
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}

object CreateHfile {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("CreateHfile").setMaster(args(0))
    val sc = new SparkContext(conf)
    val hbaseConf = HBaseConfiguration.create()

    // Expand each input line into 9999 rowkeys, sort them globally
    // (HFiles must be written in rowkey order), then build the KeyValues.
    // Note: KeyValue.Type.DeleteColumn produces delete markers; use KeyValue.Type.Put for normal cells.
    val rdd = sc.textFile(args(1))
      .flatMap(v => (1 to 9999).map(a => v + "%04d".format(a)))
      .sortBy(v => v)
      .map(r => (new ImmutableBytesWritable(Bytes.toBytes(r)),
        new KeyValue(Bytes.toBytes(r), Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"),
          System.currentTimeMillis(), KeyValue.Type.DeleteColumn)))

    rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue],
      classOf[HFileOutputFormat2], hbaseConf)
    sc.stop()
  }
}
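Whichever engine does the work, these jobs only write HFiles to HDFS; getting the data into HBase is a separate bulk-load step (the same goes for the MapReduce and Flink versions below). A minimal sketch of that step, assuming the HBase 1.x client API; the table name phone_table and the HFile directory passed as args[0] are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles;

public class BulkLoadHFiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        TableName tableName = TableName.valueOf("phone_table"); // placeholder table name
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Admin admin = conn.getAdmin();
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {
            // Moves the HFiles produced by the job above into the table's regions.
            LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
            loader.doBulkLoad(new Path(args[0]), admin, table, locator);
        }
    }
}

On HBase 1.x the same thing can be done from the shell with: hbase org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles <hfile-dir> <table>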
Now the MapReduce version:
import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class HFileCreateJob {

    private final static Logger log = LoggerFactory.getLogger(HFileCreateJob.class);

    public void run(String input, String output, String env) throws Exception {
        Configuration conf = new Configuration();
        if ("dev".equals(env)) {
            devHeader(conf);
        }
        try {
            // Delete the output directory if it already exists.
            try {
                FileSystem fs = FileSystem.get(URI.create(output), conf);
                fs.delete(new Path(output), true);
                fs.close();
            } catch (IOException e) {
                log.error("failed to clean output dir " + output, e);
            }

            Job job = Job.getInstance(conf, "HFileCreateJob");
            job.setJobName("Zhao@HFileCreateJob_V1.0");
            job.setJarByClass(HFileCreateJob.class);
            job.setMapperClass(HfileMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setReducerClass(HfileReducer.class);
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            job.setOutputValueClass(KeyValue.class);
            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            job.setOutputFormatClass(HFileOutputFormat2.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            log.error("HFileCreateJob failed", e);
        }
    }

    private void devHeader(Configuration conf) {
        // Submit from the local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://10.10.10.165:8020");
        conf.set("mapreduce.job.jar", "E:\\intermult-hbase\\target\\intermulthbase-1.0-SNAPSHOT.jar");
        // Recurse into subdirectories under the HDFS input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}
public class HfileMapper extends Mapper<LongWritable, Text, Text, Text> {

    private String rowKeySalt = ConfigFactory.load().getConfig("hfileCreate").getString("rowKeySalt");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Input lines are \001-delimited; the first field plus a salt, hashed, becomes the rowkey.
        String[] datas = value.toString().split("\\001");
        // Replace the \001 delimiter with the literal marker "!@#$" before storing the whole line.
        String content = value.toString().replaceAll("\\001", "\\!\\@\\#\\$");
        Text rowKey = new Text(SHA256Util.getSHA256Str(datas[0] + rowKeySalt));
        context.write(rowKey, new Text(content));
    }
}
public class HfileReducer extends Reducer<Text, Text, ImmutableBytesWritable, KeyValue> {

    private final static Logger logger = LoggerFactory.getLogger(HfileReducer.class);

    private Config env = ConfigFactory.load().getConfig("hfileCreate");
    private String family = env.getString("family");
    private String column = env.getString("column");

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text value : values) {
            try {
                String line = value.toString();
                logger.debug("line : " + line);
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.toString().getBytes());
                KeyValue kv = new KeyValue(key.toString().getBytes(), family.getBytes(), column.getBytes(), line.getBytes());
                context.write(rowkey, kv);
            } catch (Exception e) {
                logger.error("failed to build KeyValue for rowkey " + key, e);
            }
        }
    }
}
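The job above wires HFileOutputFormat2 by hand, which works, but it does nothing to line the output up with the target table's regions. HBase provides HFileOutputFormat2.configureIncrementalLoad, which reads the region start keys from a live table and installs a TotalOrderPartitioner plus its own sort reducer, so each reducer writes HFiles that fall inside one region. A minimal sketch of that variant, assuming the HBase 1.x client API; the table name t_phone, the family/column names and the nested mapper are placeholders, not the job above:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RegionAlignedHFileJob {

    /** Placeholder mapper: treats each input line as a rowkey and stores the line itself. */
    public static class LineToKeyValueMapper
            extends Mapper<LongWritable, Text, ImmutableBytesWritable, KeyValue> {
        @Override
        protected void map(LongWritable key, Text value, Context ctx)
                throws IOException, InterruptedException {
            byte[] row = value.toString().getBytes();
            ctx.write(new ImmutableBytesWritable(row),
                    new KeyValue(row, "phoneFamliy".getBytes(), "phoneCol".getBytes(), row));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        TableName tableName = TableName.valueOf("t_phone"); // placeholder table name
        try (Connection conn = ConnectionFactory.createConnection(conf);
             Table table = conn.getTable(tableName);
             RegionLocator locator = conn.getRegionLocator(tableName)) {

            Job job = Job.getInstance(conf, "RegionAlignedHFileJob");
            job.setJarByClass(RegionAlignedHFileJob.class);
            job.setMapperClass(LineToKeyValueMapper.class);
            job.setMapOutputKeyClass(ImmutableBytesWritable.class);
            job.setMapOutputValueClass(KeyValue.class);

            // Installs HFileOutputFormat2, a TotalOrderPartitioner over the table's region
            // start keys and HBase's KeyValueSortReducer, so each reducer's HFiles fall
            // inside exactly one region.
            HFileOutputFormat2.configureIncrementalLoad(job, table, locator);

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            job.waitForCompletion(true);
        }
    }
}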
Finally, the Flink solution:
import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
/**
 * Generate HFiles with Flink.
 * https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/batch/hadoop_compatibility.html
 * Created by geo on 2019/4/8.
 */
public class Application {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        devHeader(conf);
        Job job = Job.getInstance(conf);

        // HDFS input via the Hadoop compatibility wrappers.
        HadoopInputFormat<LongWritable, Text> hadoopIF =
                new HadoopInputFormat<>(new TextInputFormat(), LongWritable.class, Text.class, job);
        TextInputFormat.addInputPath(job, new Path("hdfs://2.2.2.2:8020/user/zhao/out0226/testHfile"));

        // This is all Flink really has to do: read, sort by key, emit (rowkey, KeyValue) pairs.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple2<LongWritable, Text>> textDataSet = env.createInput(hadoopIF);
        DataSet<Tuple2<ImmutableBytesWritable, KeyValue>> ds = textDataSet
                .map(v -> Tuple1.of(v.f1.toString()))
                .returns(Types.TUPLE(Types.STRING))
                .groupBy(0)
                .sortGroup(0, Order.ASCENDING)
                .reduceGroup(new createHfile());

        // Output types expected by HFileOutputFormat2.
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        // HDFS output via the Hadoop compatibility wrappers.
        // HFileOutputFormat2 is typed on Cell, so use the raw constructor here (KeyValue implements Cell).
        @SuppressWarnings({"unchecked", "rawtypes"})
        HadoopOutputFormat<ImmutableBytesWritable, KeyValue> hadoopOF =
                new HadoopOutputFormat(new HFileOutputFormat2(), job);
        HFileOutputFormat2.setOutputPath(job, new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"));
        job.setOutputFormatClass(HFileOutputFormat2.class);
        ds.output(hadoopOF);
        env.execute();
    }
    // Build (rowkey, KeyValue) pairs for HFileOutputFormat2.
    public static final class createHfile
            extends RichGroupReduceFunction<Tuple1<String>, Tuple2<ImmutableBytesWritable, KeyValue>> {
        @Override
        public void reduce(Iterable<Tuple1<String>> values,
                           Collector<Tuple2<ImmutableBytesWritable, KeyValue>> out) throws Exception {
            String family = "datasfamily";
            String column = "content";
            for (Tuple1<String> key : values) {
                // Use the tuple field as the rowkey; Tuple1.toString() would wrap it in parentheses.
                byte[] row = key.f0.getBytes();
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(row);
                KeyValue kv = new KeyValue(row, family.getBytes(), column.getBytes(), key.f0.getBytes());
                out.collect(Tuple2.of(rowkey, kv));
            }
        }
    }
    /**
     * For local or test-environment runs only.
     * @param conf Configuration
     */
    private static void devHeader(Configuration conf) {
        // Submit from the local machine to the test cluster.
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS", "hdfs://2.2.2.2:8020");
        // Recurse into subdirectories under the HDFS input path.
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}
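Before pointing this at real HDFS paths, the Tuple1-to-KeyValue step can be smoke-tested locally by feeding the same operators from an in-memory collection and collecting the result. A sketch against the Flink 1.7 DataSet API, meant to sit in a temporary main or test method inside Application so createHfile and the imports above are in scope; the sample phone numbers are made up:

// Local smoke test of the createHfile group-reduce: no HDFS, no HBase involved.
ExecutionEnvironment env = ExecutionEnvironment.createLocalEnvironment(1);
DataSet<Tuple2<ImmutableBytesWritable, KeyValue>> ds = env
        .fromElements("13800000001", "13800000002", "13800000001")
        .map(s -> Tuple1.of(s))
        .returns(Types.TUPLE(Types.STRING))
        .groupBy(0)
        .sortGroup(0, Order.ASCENDING)
        .reduceGroup(new createHfile());
// collect() runs the local plan and returns the results as a list.
for (Tuple2<ImmutableBytesWritable, KeyValue> t : ds.collect()) {
    System.out.println(new String(t.f0.get()) + " -> " + t.f1);
}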
And here's the pom (the relevant fragments):
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <logback.version>1.1.5</logback.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.11</artifactId>
        <version>1.0.1</version>
        <exclusions>
            <exclusion><groupId>org.slf4j</groupId><artifactId>slf4j-log4j12</artifactId></exclusion>
            <exclusion><groupId>com.sun.jmx</groupId><artifactId>jmxri</artifactId></exclusion>
            <exclusion><groupId>com.sun.jdmk</groupId><artifactId>jmxtools</artifactId></exclusion>
            <exclusion><groupId>javax.jms</groupId><artifactId>jms</artifactId></exclusion>
            <exclusion><groupId>junit</groupId><artifactId>junit</artifactId></exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>com.typesafe</groupId>
        <artifactId>config</artifactId>
        <version>1.2.1</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-core</artifactId>
        <version>${logback.version}</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-classic</artifactId>
        <version>${logback.version}</version>
    </dependency>
    <dependency>
        <groupId>ch.qos.logback</groupId>
        <artifactId>logback-access</artifactId>
        <version>${logback.version}</version>
    </dependency>
    <dependency>
        <groupId>commons-codec</groupId>
        <artifactId>commons-codec</artifactId>
        <version>RELEASE</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-shade-plugin</artifactId>
            <version>2.4.3</version>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals><goal>shade</goal></goals>
                    <configuration>
                        <transformers>
                            <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                <mainClass>com.geotmt.dw.Application</mainClass>
                            </transformer>
                        </transformers>
                    </configuration>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
Final thoughts:
Flink is still a fairly young technology; let's watch it grow up.