Custom OutputFormat: writing output to different files based on content

Depending on its content, each record is written to a different output file. The job consists of a Mapper that filters and tags the input records, and a custom OutputFormat whose RecordWriter routes each record to the appropriate file.

Code

package cn.feizhou.logenhance;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class LogEnhance {

	static class LogEnhanceMapper extends Mapper<LongWritable, Text, Text, NullWritable> {


		Text k = new Text();
		NullWritable v = NullWritable.get();
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Get a counter to track the number of malformed log lines: (group name, counter name)
			Counter counter = context.getCounter("malformed", "malformedline");
			String line = value.toString();
			String[] fields = StringUtils.split(line, ",");

			String name = fields[0];
			if (name.contains("order") || name.contains("pid")) {
				k.set(line + "--");
				context.write(k, v);
			} else {
				// increment the malformed-line counter
				counter.increment(1);
			}
		}

	}

	public static void main(String[] args) throws Exception {

		Configuration conf = new Configuration();

		Job job = Job.getInstance(conf);

		job.setJarByClass(LogEnhance.class);

		job.setMapperClass(LogEnhanceMapper.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		// To route different content to different target paths, use a custom OutputFormat
		job.setOutputFormatClass(LogEnhanceOutputFormat.class);

		FileInputFormat.setInputPaths(job, new Path("H:/test/"));

		// Although we use a custom OutputFormat, it extends FileOutputFormat,
		// and FileOutputFormat always writes a _SUCCESS file, so an output path still has to be set
		FileOutputFormat.setOutputPath(job, new Path("H:/out/"));

		// no reducer is needed
		job.setNumReduceTasks(0);

		job.waitForCompletion(true);
		System.exit(0);

	}

}
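
A note on the counter: after the job finishes, the "malformed" counter is printed together with the other job counters. If you also want to read it programmatically, a minimal sketch (a hypothetical variant of the last two lines of main above, using the same group and counter names as in the mapper) could look like this:

		// Sketch: read the malformed-line counter once waitForCompletion() returns
		boolean ok = job.waitForCompletion(true);
		long malformed = job.getCounters().findCounter("malformed", "malformedline").getValue();
		System.out.println("malformed lines: " + malformed);
		System.exit(ok ? 0 : 1);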
-------------------------------------
package cn.feizhou.logenhance;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * When a map task or reduce task writes its final output, it first calls
 * OutputFormat's getRecordWriter() to obtain a RecordWriter, and then calls
 * RecordWriter's write(k, v) to write each record out.
 * 
 * @author
 * 
 */
public class LogEnhanceOutputFormat extends FileOutputFormat<Text, NullWritable> {

	@Override
	public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {

		FileSystem fs = FileSystem.get(context.getConfiguration());

		Path orderPath = new Path("H:/out/en/order.txt");
		Path pidPath = new Path("H:/out/en/pid.txt");

		FSDataOutputStream orderPathOs = fs.create(orderPath);
		FSDataOutputStream pidPathOs = fs.create(pidPath);

		return new EnhanceRecordWriter(orderPathOs, pidPathOs);
	}

	/**
	 * Our own RecordWriter implementation
	 * 
	 * @author
	 * 
	 */
	static class EnhanceRecordWriter extends RecordWriter<Text, NullWritable> {
		FSDataOutputStream orderPathOs = null;
		FSDataOutputStream pidPathOs = null;

		public EnhanceRecordWriter(FSDataOutputStream orderPathOs, FSDataOutputStream pidPathOs) {
			super();
			this.orderPathOs = orderPathOs;
			this.pidPathOs = pidPathOs;
		}

		@Override
		public void write(Text key, NullWritable value) throws IOException, InterruptedException {
			String result = key.toString();
			// order records are written to H:/out/en/order.txt
			if (result.contains("order_")) {
				orderPathOs.write(result.getBytes());
			} else {
				// product (pid) records are written to H:/out/en/pid.txt
				pidPathOs.write(result.getBytes());
			}

		}

		@Override
		public void close(TaskAttemptContext context) throws IOException, InterruptedException {
			if (orderPathOs != null) {
				orderPathOs.close();
			}
			if (pidPathOs != null) {
				pidPathOs.close();
			}

		}

	}

}
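
Note that write() sends the raw bytes of the key and does not append a record separator; this is why, in the results below, each file holds all of its records concatenated on a single line, separated only by the "--" that the mapper appended. If one record per line is preferred, a small hypothetical change to write() would be:

			// Variant sketch: append a newline so order.txt / pid.txt hold one record per line
			if (result.contains("order_")) {
				orderPathOs.write((result + "\n").getBytes());
			} else {
				pidPathOs.write((result + "\n").getBytes());
			}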

Test data: orders.txt

order_0000001,22
order_0000001,22
order_0000002,22
pid_0000002,22
pid_0000002,22
xx_0000003,22
xx_0000006,22
xx_0000005,22
xx_0000004,22
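
With this input, the three order_* lines and the two pid_* lines pass the mapper's filter, while the four xx_* lines only increment the malformed counter and are not written anywhere.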

 

Results


order.txt

order_0000001,22--order_0000001,22--order_0000002,22--

 

pid.txt

pid_0000002,22--pid_0000002,22--

 
