Flink生成Hfile

提出需求:

       为了统一技术栈,团队一致同意使用Flink对批计算和流计算进行统一处理。

 

问题来了:

      相对Spark来说,Flink似乎还不够完善:Spark分分钟就能搞定的事情,在Flink里就需要动动脑子了。

 

先参考Spark的实现:

/**
 * Spark reference job: expands every input line into 9999 row keys and writes
 * them out through `HFileOutputFormat2`.
 */
object CreateHfile {

  /**
   * Entry point.
   *
   * @param args `args(0)` is the Spark master, `args(1)` the input text path,
   *             `args(2)` the output path handed to `saveAsNewAPIHadoopFile`.
   */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("CreateHfile").setMaster(args(0))
    val sc = new SparkContext(conf)

    // Stop the context even when the job fails, so the application does not
    // linger after an exception.
    try {
      val hbaseConf = HBaseConfiguration.create()

      val rdd = sc.textFile(args(1))
        // Append a zero-padded suffix 0001..9999 to every input line.
        .flatMap(line => (1 to 9999).map(a => line + "%04d".format(a)))
        // Records are sorted by row key before being written.
        .sortBy(rowKey => rowKey)
        .map { rowKey =>
          // Encode the row key once; KeyValue copies the bytes into its own buffer.
          val row = Bytes.toBytes(rowKey)
          // NOTE(review): the family literal is spelled "phoneFamliy" and the cell type
          // is DeleteColumn (a delete marker) — both kept as-is, confirm they are intended.
          (new ImmutableBytesWritable(row),
            new KeyValue(row, Bytes.toBytes("phoneFamliy"), Bytes.toBytes("phoneCol"), System.currentTimeMillis(), KeyValue.Type.DeleteColumn))
        }

      rdd.saveAsNewAPIHadoopFile(args(2), classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], hbaseConf)
    } finally {
      sc.stop()
    }

  }

}

再来看看MR

public class HFileCreateJob {
    private final static Logger log = LoggerFactory.getLogger(HFileCreateJob.class);

    public void run(String input,String output,String env) throws Exception {

        Configuration conf = new Configuration();
        if("dev".equals(env)){
            devHeader(conf) ;
        }

        try {
            // 运行前,删除已存在的中间输出目录
            try {
                FileSystem fs = FileSystem.get(URI.create(output), conf);
                fs.delete(new Path(output), true);
                fs.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            Job job = Job.getInstance(conf, "HFileCreateJob");
            job.setJobName("Zhao@HFileCreateJob_V1.0");
            job.setJarByClass(HFileCreateJob.class);
            job.setMapperClass(HfileMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            job.setReducerClass(HfileReducer.class);
            job.setOutputKeyClass(ImmutableBytesWritable.class);
            job.setOutputValueClass(KeyValue.class);
            FileInputFormat.addInputPath(job, new Path(input));
            FileOutputFormat.setOutputPath(job, new Path(output));
            job.setOutputFormatClass(HFileOutputFormat2.class);
            System.exit(job.waitForCompletion(true) ? 0 : 1);

        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    private void devHeader(Configuration conf){
        // 本地测试提交到测试集群
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS","hdfs://10.10.10.165:8020");
        conf.set("mapreduce.job.jar","E:\\intermult-hbase\\target\\intermulthbase-1.0-SNAPSHOT.jar");
        // 支持hdfs下目录含子目录
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }

/**
 * Mapper that keys each input line by the SHA-256 of its first field plus a
 * configured salt, and emits the line with its field separators replaced.
 */
public class HfileMapper extends Mapper<LongWritable, Text, Text, Text> {
    /** Salt appended to the first field before hashing; read from the "hfileCreate" config section. */
    private final String rowKeySalt = ConfigFactory.load().getConfig("hfileCreate").getString("rowKeySalt");

    /**
     * Emits {@code (SHA256(firstField + salt), line with \001 replaced by "!@#$")}.
     *
     * @param key     input key supplied by the input format (unused)
     * @param value   one input line whose fields are separated by the \001 character
     * @param context MapReduce context used to emit the pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        // A limit of 2 always yields at least one element, so a line consisting only
        // of separators gives an empty first field instead of an empty array.
        String firstField = line.split("\\001", 2)[0];
        // In the replacement string the backslashes only escape the following
        // character, so every \001 becomes the literal text !@#$
        String content = line.replaceAll("\\001","\\!\\@\\#\\$");
        Text rowKey = new Text(SHA256Util.getSHA256Str(firstField + rowKeySalt));
        context.write(rowKey, new Text(content));
    }
}

/**
 * Reducer that turns every (row key, line) pair into a {@code KeyValue} in the
 * configured column family and column.
 */
public class HfileReducer extends Reducer<Text, Text, ImmutableBytesWritable, KeyValue> {
    private final static Logger logger = LoggerFactory.getLogger(HfileReducer.class);
    /** Explicit charset so the emitted bytes do not depend on the JVM default. */
    private static final java.nio.charset.Charset UTF8 = java.nio.charset.StandardCharsets.UTF_8;

    private final Config env = ConfigFactory.load().getConfig("hfileCreate");
    private final String family = env.getString("family");
    private final String column = env.getString("column");

    /**
     * Writes one {@code KeyValue} per value, all under the row given by {@code key}.
     *
     * @param key     row key text
     * @param values  lines that share this row key
     * @param context MapReduce context used to emit the cells
     * @throws IOException          if writing a cell fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {

        // These byte arrays are the same for every value of the group.
        byte[] row = key.toString().getBytes(UTF8);
        byte[] familyBytes = family.getBytes(UTF8);
        byte[] columnBytes = column.getBytes(UTF8);
        ImmutableBytesWritable rowkey = new ImmutableBytesWritable(row);

        for (Text value : values) {
            String line = value.toString();
            logger.debug("line : {}", line);
            KeyValue kv;
            try {
                kv = new KeyValue(row, familyBytes, columnBytes, line.getBytes(UTF8));
            } catch (RuntimeException e) {
                // Skip a record that cannot be turned into a cell and keep going.
                logger.error("", e);
                continue;
            }
            // I/O failures and interruption are not swallowed: they propagate so the
            // task fails instead of silently producing incomplete output.
            context.write(rowkey, kv);
        }
    }
}

最后是Flink的方案:

import java.nio.charset.StandardCharsets;
import org.apache.flink.api.common.functions.RichGroupReduceFunction;
import org.apache.flink.api.common.operators.Order;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat;
import org.apache.flink.api.java.hadoop.mapreduce.HadoopOutputFormat;
import org.apache.flink.api.java.tuple.Tuple1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

/**
 * 用Flink生产Hfile
 * https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/batch/hadoop_compatibility.html
 * Created by geo on 2019/4/8. */
public class Application {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        devHeader(conf);
        Job job = Job.getInstance(conf);

        // HDFS 输入
        HadoopInputFormat hadoopIF =
                new HadoopInputFormat(
                        new TextInputFormat(), LongWritable.class, Text.class, job
                );
        TextInputFormat.addInputPath(job, new Path("hdfs://2.2.2.2:8020/user/zhao/out0226/testHfile"));


        // Flink就干了这点事
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet> textDataSet = env.createInput(hadoopIF);
        DataSet> ds =  textDataSet.map(v-> Tuple1.of(v.f1.toString()))
                .returns(Types.TUPLE(Types.STRING))
                .groupBy(0)
                .sortGroup(0,Order.ASCENDING)
                .reduceGroup(new createHfile());

        // 设置输出类型
        job.setOutputKeyClass(ImmutableBytesWritable.class);
        job.setOutputValueClass(KeyValue.class);

        // 输出到HDFS
        HadoopOutputFormat hadoopOF =
                new HadoopOutputFormat(
                        new HFileOutputFormat2(), job
                );
        HFileOutputFormat2.setOutputPath(job, new Path("hdfs://10.111.32.165:8020/user/zhao/out0226/9/"));
        job.setOutputFormatClass(HFileOutputFormat2.class);
        ds.output(hadoopOF);
        env.execute();
    }

    // 生产 Tuple2
    public static final class createHfile extends RichGroupReduceFunction, Tuple2> {

        @Override
        public void reduce(Iterable> values, Collector> out) throws Exception {
            String family="datasfamily";
            String column="content";
            for (Tuple1 key:values) {
                ImmutableBytesWritable rowkey = new ImmutableBytesWritable(key.toString().getBytes());
                KeyValue kv = new KeyValue(key.toString().getBytes(), family.getBytes(), column.getBytes() , key.f0.getBytes());
                out.collect(Tuple2.of(rowkey,kv));
            }
        }
    }

    /**
     * 本地或测试环境使用
     * @param conf Configuration
     */
    private static void devHeader(Configuration conf){
        // 本地测试提交到测试集群
        conf.set("mapreduce.app-submission.cross-platform", "true");
        conf.set("mapreduce.job.ubertask.enable", "true");
        conf.set("fs.defaultFS","hdfs://2.2.2.2:8020");
        // 支持hdfs下目录含子目录
        conf.set("mapreduce.input.fileinputformat.input.dir.recursive", "true");
        System.setProperty("hadoop.home.dir", "D:\\soft\\developsoft\\Hadoop\\hadoop-2.6.5");
        System.setProperty("HADOOP_USER_NAME", "hdfs");
    }
}

来,pom也抛上来

 
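    <!-- 注:原文中的XML标签在发布时丢失,以下标签按Maven惯例还原,属性名请以实际工程为准 -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <logback.version>1.1.5</logback.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>1.0.1</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxri</artifactId>
                    <groupId>com.sun.jmx</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jmxtools</artifactId>
                    <groupId>com.sun.jdmk</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>jms</artifactId>
                    <groupId>javax.jms</groupId>
                </exclusion>
                <exclusion>
                    <artifactId>junit</artifactId>
                    <groupId>junit</groupId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-core</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-classic</artifactId>
            <version>${logback.version}</version>
        </dependency>

        <dependency>
            <groupId>ch.qos.logback</groupId>
            <artifactId>logback-access</artifactId>
            <version>${logback.version}</version>
        </dependency>
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>RELEASE</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.4.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>com.geotmt.dw.Application</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>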

写在最后:

 

    Flink还是蛮新的技术,慢慢看着它长大吧。

你可能感兴趣的:(大数据,demo)