该项目的pom.xml文件:
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
package com.z.youtube.util; public class ETLUtils { /** * 1、过滤不合法数据 * 2、去掉&符号左右两边的空格 * 3、\t换成&符号 * @param ori * @return */ public static String getETLString(String ori){ String[] splits = ori.split("\t"); //1、过滤不合法数据 if(splits.length < 9) return null; //2、去掉&符号左右两边的空格 splits[3] = splits[3].replaceAll(" ", ""); StringBuilder sb = new StringBuilder(); //3、\t换成&符号 for(int i = 0; i < splits.length; i++){ sb.append(splits[i]); if(i < 9){ if(i != splits.length - 1){ sb.append("\t"); } }else{ if(i != splits.length - 1){ sb.append("&"); } } } return sb.toString(); } } |
package com.z.youtube.mr.etl; import java.io.IOException; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import com.z.youtube.util.ETLUtil; public class VideoETLMapper extends Mapper Text text = new Text(); @Override protected void map(Object key, Text value, Context context) throws IOException, InterruptedException { String etlString = ETLUtil.oriString2ETLString(value.toString()); if(StringUtils.isBlank(etlString)) return; text.set(etlString); context.write(NullWritable.get(), text); } } |
package com.z.youtube.mr.etl; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; public class VideoETLRunner implements Tool { private Configuration conf = null; @Override public void setConf(Configuration conf) { this.conf = conf; } @Override public Configuration getConf() { return this.conf; } @Override public int run(String[] args) throws Exception { conf = this.getConf(); conf.set("inpath", args[0]); conf.set("outpath", args[1]); Job job = Job.getInstance(conf, "youtube-video-etl"); job.setJarByClass(VideoETLRunner.class); job.setMapperClass(VideoETLMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setNumReduceTasks(0); this.initJobInputPath(job); this.initJobOutputPath(job); return job.waitForCompletion(true) ? 
0 : 1; } private void initJobOutputPath(Job job) throws IOException { Configuration conf = job.getConfiguration(); String outPathString = conf.get("outpath"); FileSystem fs = FileSystem.get(conf); Path outPath = new Path(outPathString); if(fs.exists(outPath)){ fs.delete(outPath, true); } FileOutputFormat.setOutputPath(job, outPath); } private void initJobInputPath(Job job) throws IOException { Configuration conf = job.getConfiguration(); String inPathString = conf.get("inpath"); FileSystem fs = FileSystem.get(conf); Path inPath = new Path(inPathString); if(fs.exists(inPath)){ FileInputFormat.addInputPath(job, inPath); }else{ throw new RuntimeException("HDFS中该文件目录不存在:" + inPathString); } } public static void main(String[] args) { try { int resultCode = ToolRunner.run(new VideoETLRunner(), args); if(resultCode == 0){ System.out.println("Success!"); }else{ System.out.println("Fail!"); } System.exit(resultCode); } catch (Exception e) { e.printStackTrace(); System.exit(1); } } } |
赠送编译打包命令提示:-P local clean package
$ bin/yarn jar ~/softwares/jars/youtube-0.0.1-SNAPSHOT.jar \
    com.z.youtube.mr.etl.VideoETLRunner \
    /youtube/video/2008/0222 \
    /youtube/output/video/2008/0222