Big Data with Hive: Youtube Extension Project Case Study (Part 2)

The project's pom.xml file:

<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">

    <modelVersion>4.0.0</modelVersion>

    <groupId>com.z</groupId>
    <artifactId>youtube</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>youtube</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <repositories>
        <repository>
            <id>centor</id>
            <url>http://central.maven.org/maven2/</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
            <version>2.7.2</version>
        </dependency>
    </dependencies>
</project>

3.6.1 ETL: ETLUtil

package com.z.youtube.util;

public class ETLUtil {

    /**
     * 1. Filter out malformed records (fewer than 9 fields).
     * 2. Remove the spaces around the "&" separators in the category field.
     * 3. Join the related video ids with "&" instead of "\t".
     * @param ori the original raw line
     * @return the cleaned line, or null if the record is invalid
     */
    public static String oriString2ETLString(String ori) {
        String[] splits = ori.split("\t");

        // 1. Filter out malformed records
        if (splits.length < 9) return null;

        // 2. Remove the spaces in the category field (index 3)
        splits[3] = splits[3].replaceAll(" ", "");

        StringBuilder sb = new StringBuilder();

        // 3. Keep "\t" between the first 9 fields, join the related video ids with "&"
        for (int i = 0; i < splits.length; i++) {
            sb.append(splits[i]);
            if (i < 9) {
                if (i != splits.length - 1) {
                    sb.append("\t");
                }
            } else {
                if (i != splits.length - 1) {
                    sb.append("&");
                }
            }
        }
        return sb.toString();
    }
}
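Before wiring the utility into the MapReduce job, the cleaning logic can be sanity-checked locally. Below is a minimal JUnit 3 test sketch (matching the junit 3.8.1 dependency declared in the pom); the test class name and the sample line values are made up for illustration and do not come from the real data set:

package com.z.youtube.util;

import junit.framework.TestCase;

public class ETLUtilTest extends TestCase {

    // A made-up input line: 9 base fields separated by \t, followed by related video ids.
    // The category field (index 3) contains "People & Blogs", whose spaces should be removed.
    public void testOriString2ETLString() {
        String ori = "videoId1\tuploader\t100\tPeople & Blogs\t500\t3000\t4.5\t10\t5\trel1\trel2\trel3";
        String etl = ETLUtil.oriString2ETLString(ori);
        assertNotNull(etl);
        // Related video ids should now be joined with "&" instead of "\t".
        assertTrue(etl.endsWith("rel1&rel2&rel3"));
        // The category field should no longer contain spaces.
        assertTrue(etl.contains("People&Blogs"));
    }

    // A line with fewer than 9 fields is considered invalid and should be dropped.
    public void testInvalidLineReturnsNull() {
        assertNull(ETLUtil.oriString2ETLString("only\tfour\tfields\there"));
    }
}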

3.6.2 ETLMapper

package com.z.youtube.mr.etl;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import com.z.youtube.util.ETLUtil;

public class VideoETLMapper extends Mapper<Object, Text, NullWritable, Text> {

    Text text = new Text();

    @Override
    protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Clean the raw line; invalid records come back as null and are dropped.
        String etlString = ETLUtil.oriString2ETLString(value.toString());
        if (StringUtils.isBlank(etlString)) return;

        text.set(etlString);
        context.write(NullWritable.get(), text);
    }
}

3.6.3 ETLRunner

package com.z.youtube.mr.etl;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class VideoETLRunner implements Tool {

    private Configuration conf = null;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public int run(String[] args) throws Exception {
        conf = this.getConf();
        conf.set("inpath", args[0]);
        conf.set("outpath", args[1]);

        Job job = Job.getInstance(conf, "youtube-video-etl");
        job.setJarByClass(VideoETLRunner.class);

        job.setMapperClass(VideoETLMapper.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Map-only job: the ETL step needs no reducers.
        job.setNumReduceTasks(0);

        this.initJobInputPath(job);
        this.initJobOutputPath(job);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    private void initJobOutputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String outPathString = conf.get("outpath");

        FileSystem fs = FileSystem.get(conf);
        Path outPath = new Path(outPathString);
        // Remove an existing output directory so the job can be rerun.
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
    }

    private void initJobInputPath(Job job) throws IOException {
        Configuration conf = job.getConfiguration();
        String inPathString = conf.get("inpath");

        FileSystem fs = FileSystem.get(conf);
        Path inPath = new Path(inPathString);
        if (fs.exists(inPath)) {
            FileInputFormat.addInputPath(job, inPath);
        } else {
            throw new RuntimeException("Input directory does not exist on HDFS: " + inPathString);
        }
    }

    public static void main(String[] args) {
        try {
            int resultCode = ToolRunner.run(new VideoETLRunner(), args);
            if (resultCode == 0) {
                System.out.println("Success!");
            } else {
                System.out.println("Fail!");
            }
            System.exit(resultCode);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }
}

3.6.4 Running the ETL job

Bonus hint, the Maven build-and-package command: mvn -P local clean package (note that -P local selects a Maven profile named local, which is not defined in the pom shown above).

$ bin/yarn jar ~/softwares/jars/youtube-0.0.1-SNAPSHOT.jar \

com.z.youtube.mr.etl.VideoETLRunner \

/youtube/video/2008/0222 \

/youtube/output/video/2008/0222
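
After the job reports Success!, the cleaned data can be spot-checked on HDFS before loading it into Hive. The commands below are a quick verification sketch; the part-m-00000 file name assumes the default output naming of a map-only job:

$ bin/hdfs dfs -ls /youtube/output/video/2008/0222

$ bin/hdfs dfs -cat /youtube/output/video/2008/0222/part-m-00000 | head -n 5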
