Code download: https://github.com/tazhigang/big-data-github.git
1. Overview
When a single MapReduce job has to write two kinds of results to different directories depending on the data, this kind of flexible output requirement can be met with a custom OutputFormat. The two steps are:
- Define a custom OutputFormat.
- Override its RecordWriter, in particular the write() method that actually emits the output data (see the sketch right after this list).
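The overall shape of the pattern looks roughly like the hypothetical skeleton below (placeholder comments only; the concrete classes for this case follow in section 3): the OutputFormat merely hands out a RecordWriter, and all of the routing logic lives in write().

```java
package com.ittzg.hadoop.outputformat;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical skeleton: the OutputFormat only hands out a RecordWriter;
// the "which file does this record go to" decision is made inside write().
public class SkeletonOutputFormat extends FileOutputFormat<Text, NullWritable> {
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        return new RecordWriter<Text, NullWritable>() {
            @Override
            public void write(Text key, NullWritable value) throws IOException {
                // choose the target output stream based on the record, then write it
            }
            @Override
            public void close(TaskAttemptContext context) throws IOException {
                // close every stream opened for writing
            }
        };
    }
}
```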
2. Use Case
- Requirement: filter the URLs in the input log file by whether they contain baidu
- (1) URLs containing baidu are written to j:/url/baidu_url.txt
- (2) URLs not containing baidu are written to j:/url/other_url.txt
- Input data
=================log.txt====================
https://www.baidu.com
http://news.baidu.com
https://map.baidu.com
http://www.google.com
http://cn.bing.com
http://www.sohu.com
http://www.sina.com
https://github.com
https://my.oschina.net
- Output
=================baidu_url.txt====================
http://news.baidu.com
https://map.baidu.com
https://www.baidu.com
=================other_url.txt====================
http://cn.bing.com
http://www.google.com
http://www.sina.com
http://www.sohu.com
https://github.com
https://my.oschina.net
3. Create the Maven Project
- HDFSUtil.java
```java
package com.ittzg.hadoop.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;

/**
 * @email: [email protected]
 * @author: ittzg
 * @date: 2019/7/7 22:54
 */
public class HDFSUtil {
    Configuration configuration = new Configuration();
    FileSystem fileSystem = null;

    /**
     * Runs before each @Test method.
     */
    @Before
    public void init(){
        configuration.set("fs.defaultFS","hdfs://hadoop-ip-101:9000");
        try {
            fileSystem = FileSystem.get(new URI("hdfs://hadoop-ip-101:9000"),configuration,"hadoop");
        } catch (Exception e) {
            throw new RuntimeException("Failed to obtain HDFS client connection");
        }
    }

    /**
     * Runs after each @Test method.
     */
    @After
    public void closeRes(){
        if(fileSystem != null){
            try {
                fileSystem.close();
            } catch (IOException e) {
                throw new RuntimeException("Failed to close HDFS client connection");
            }
        }
    }

    /**
     * Upload the input file to HDFS.
     */
    @Test
    public void putFileToHDFS(){
        try {
            fileSystem.copyFromLocalFile(new Path("F:\\big-data-github\\hadoop-parent\\hadoop-outputformat\\src\\main\\resources\\file\\log.txt"),new Path("/user/hadoop/outputformat/input/log.txt"));
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(e.getMessage());
        }
    }

    /**
     * Create a directory on HDFS (multi-level directories are supported).
     */
    @Test
    public void mkdirAtHDFS(){
        try {
            boolean mkdirs = fileSystem.mkdirs(new Path("/user/hadoop/outputformat/input"));
            System.out.println(mkdirs);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
```
- MyRecordWriter.java
```java
package com.ittzg.hadoop.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

/**
 * @email: [email protected]
 * @author: ittzg
 * @date: 2019/7/7 22:57
 */
public class MyRecordWriter extends RecordWriter<Text, NullWritable> {
    private FSDataOutputStream baiduOut = null;
    private FSDataOutputStream otherOut = null;

    public MyRecordWriter(TaskAttemptContext job) {
        Configuration configuration = job.getConfiguration();
        try {
            FileSystem fileSystem = FileSystem.get(configuration);
            // Create the two output streams
            Path baiduPath = new Path("j:/url/baidu_url.txt");
            Path otherPath = new Path("j:/url/other_url.txt");
            baiduOut = fileSystem.create(baiduPath);
            otherOut = fileSystem.create(otherPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        // Route each record to one of the two files depending on whether it contains "baidu"
        if(key.toString().contains("baidu")){
            baiduOut.write(key.toString().getBytes());
        }else{
            otherOut.write(key.toString().getBytes());
        }
    }

    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if(baiduOut != null){
            baiduOut.close();
        }
        if(otherOut != null){
            otherOut.close();
        }
    }
}
```
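Note that the j:/url/... paths above are hard-coded local paths, so they assume the job runs somewhere that drive exists (for example a local run on Windows). If you would rather keep the two result files under the job's HDFS output directory, the constructor could derive the paths from the output path configured in the driver. The following is only a rough sketch, not part of the original code; it assumes FileOutputFormat.setOutputPath() is called in the driver and needs an extra import of org.apache.hadoop.mapreduce.lib.output.FileOutputFormat:

```java
// Sketch of an alternative constructor: build the two file paths from the job's
// output directory instead of hard-coding local paths.
public MyRecordWriter(TaskAttemptContext job) throws IOException {
    Configuration configuration = job.getConfiguration();
    FileSystem fileSystem = FileSystem.get(configuration);
    Path outputDir = FileOutputFormat.getOutputPath(job); // the path set via FileOutputFormat.setOutputPath() in the driver
    baiduOut = fileSystem.create(new Path(outputDir, "baidu_url.txt"));
    otherOut = fileSystem.create(new Path(outputDir, "other_url.txt"));
}
```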
- MyFileOutputFormat.java

```java
package com.ittzg.hadoop.outputformat;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @email: [email protected]
 * @author: ittzg
 * @date: 2019/7/7 22:55
 */
public class MyFileOutputFormat extends FileOutputFormat<Text, NullWritable> {
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        return new MyRecordWriter(job);
    }
}
```

- MyFileOutputFormatDriver.java
```java
package com.ittzg.hadoop.outputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

/**
 * @email: [email protected]
 * @author: ittzg
 * @date: 2019/7/7 23:08
 */
public class MyFileOutputFormatDriver {
    static class MyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Emit each URL line as the key; no value is needed
            context.write(value, NullWritable.get());
        }
    }

    static class MyReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
        Text urlFormat = new Text();
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            // Append a line terminator, since MyRecordWriter writes the key bytes as-is
            urlFormat.set(key.toString()+"\t\n");
            context.write(urlFormat, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {
        String input = "hdfs://hadoop-ip-101:9000/user/hadoop/outputformat/input";
        String output = "hdfs://hadoop-ip-101:9000/user/hadoop/outputformat/output";
        Configuration conf = new Configuration();
        conf.set("mapreduce.app-submission.cross-platform","true");

        Job job = Job.getInstance(conf);
        job.setJarByClass(MyFileOutputFormatDriver.class);

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReduce.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Delete the output directory if it already exists
        FileSystem fs = FileSystem.get(new URI("hdfs://hadoop-ip-101:9000"),conf,"hadoop");
        Path outPath = new Path(output);
        if(fs.exists(outPath)){
            fs.delete(outPath,true);
        }

        // Plug the custom output format into the job
        job.setOutputFormatClass(MyFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(input));
        // Although we use a custom OutputFormat, it extends FileOutputFormat,
        // which still writes a _SUCCESS file, so an output directory must still be specified here
        FileOutputFormat.setOutputPath(job, outPath);

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
```
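After a successful run, the two result files appear at the paths created in MyRecordWriter's constructor, while the directory passed to FileOutputFormat.setOutputPath() typically contains only the _SUCCESS marker, because the default part file is never opened by the custom RecordWriter.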