03 |
import org.apache.hadoop.conf.Configuration; |
04 |
import org.apache.hadoop.fs.Path; |
05 |
import org.apache.hadoop.io.LongWritable; |
06 |
import org.apache.hadoop.io.NullWritable; |
07 |
import org.apache.hadoop.io.Text; |
08 |
import org.apache.hadoop.mapred.*; |
09 |
import org.apache.hadoop.mapred.lib.MultipleOutputs; |
10 |
import org.apache.hadoop.util.GenericOptionsParser; |
12 |
import java.io.IOException; |
15 |
* User:http://www.iteblog.com/ |
19 |
public class OldMulOutput { |
20 |
public static class MapClass |
22 |
implements Mapper<LongWritable, |
23 |
Text, NullWritable, Text> { |
24 |
private MultipleOutputs mos; |
25 |
private OutputCollector<NullWritable, Text> collector; |
27 |
public void configure(JobConf conf) { |
28 |
mos = new MultipleOutputs(conf); |
31 |
public void map(LongWritable key, Text value, |
32 |
OutputCollector<NullWritable, Text> output, |
33 |
Reporter reporter) throws IOException { |
34 |
String[] arr = value.toString().split( "," , - 1 ); |
35 |
String chrono = arr[ 1 ] + "," + arr[ 2 ]; |
36 |
String geo = arr[ 4 ] + "," + arr[ 5 ]; |
37 |
collector = mos.getCollector( "chrono" , reporter); |
38 |
collector.collect(NullWritable.get(), new Text(chrono)); |
39 |
collector = mos.getCollector( "geo" , reporter); |
40 |
collector.collect(NullWritable.get(), new Text(geo)); |
43 |
public void close() throws IOException { |
47 |
public static void main(String[] args) throws IOException { |
48 |
Configuration conf = new Configuration(); |
49 |
String[] remainingArgs = |
50 |
new GenericOptionsParser(conf, args).getRemainingArgs(); |
52 |
if (remainingArgs.length != 2 ) { |
53 |
System.err.println( "Error!" ); |
57 |
JobConf job = new JobConf(conf, OldMulOutput. class ); |
58 |
Path in = new Path(remainingArgs[ 0 ]); |
59 |
Path out = new Path(remainingArgs[ 1 ]); |
60 |
FileInputFormat.setInputPaths(job, in); |
61 |
FileOutputFormat.setOutputPath(job, out); |
63 |
job.setJobName( "MultiFile" ); |
64 |
job.setMapperClass(MapClass. class ); |
65 |
job.setInputFormat(TextInputFormat. class ); |
66 |
job.setOutputKeyClass(NullWritable. class ); |
67 |
job.setOutputValueClass(Text. class ); |
69 |
job.setNumReduceTasks( 0 ); |
70 |
MultipleOutputs.addNamedOutput(job, |
72 |
TextOutputFormat. class , |
76 |
MultipleOutputs.addNamedOutput(job, |
78 |
TextOutputFormat. class , |
81 |
JobClient.runJob(job); |
上面的程序来源于《Hadoop in Action》。同样将上面的程序打包成 jar 文件(具体打包方法这里不再赘述),并在 Hadoop 2.2.0 上运行(测试数据请在这里下载:http://pan.baidu.com/s/1td8xN):
1 |
/home/q/hadoop- 2.2 . 0 /bin/hadoop jar \ |
2 |
/export1/tmp/wyp/OutputText.jar com.wyp.OldMulOutput \ |
3 |
/home/wyp/apat63_99.txt \ |
01 |
[wyp @l -datalogm1.data.cn1 bin]$ /home/q/hadoop- 2.2 . 0 /bin/hadoop fs \ |
04 |
-rw-r--r-- 3 wyp sg 0 2013 - 11 - 26 14 : 57 /home/wyp/out5/_SUCCESS |
05 |
-rw-r--r-- 3 wyp sg 31243 2013 - 11 - 26 15 : 57 /home/wyp/out5/chrono-m- 00000 |
06 |
-rw-r--r-- 3 wyp sg 22719 2013 - 11 - 26 15 : 57 /home/wyp/out5/chrono-m- 00001 |
07 |
-rw-r--r-- 3 wyp sg 29922 2013 - 11 - 26 15 : 57 /home/wyp/out5/geo-m- 00000 |
08 |
-rw-r--r-- 3 wyp sg 20429 2013 - 11 - 26 15 : 57 /home/wyp/out5/geo-m- 00001 |
09 |
-rw-r--r-- 3 wyp sg 0 2013 - 11 - 26 14 : 57 /home/wyp/out5/part-m- 00000 |
10 |
-rw-r--r-- 3 wyp sg 0 2013 - 11 - 26 14 : 57 /home/wyp/out5/part-m- 00001 |
The MultipleOutputs class simplifies writing output data to multiple outputs. It covers two cases:
Case one: writing to additional outputs other than the job default output. Each additional output, or named output, may be configured with its own OutputFormat, with its own key class and with its own value class.
Case two: writing data to different files whose names are provided by the user.
In the old MapReduce API there are two classes for producing multiple outputs: MultipleOutputFormat and MultipleOutputs. In a nutshell, MultipleOutputs is more fully featured, but MultipleOutputFormat has more control over the output directory structure and file naming. MultipleOutputs in the new API combines the best features of the two multiple output classes in the old API.
03 |
import org.apache.hadoop.conf.Configuration; |
04 |
import org.apache.hadoop.fs.Path; |
05 |
import org.apache.hadoop.io.LongWritable; |
06 |
import org.apache.hadoop.io.NullWritable; |
07 |
import org.apache.hadoop.io.Text; |
08 |
import org.apache.hadoop.mapreduce.Job; |
09 |
import org.apache.hadoop.mapreduce.Mapper; |
10 |
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; |
11 |
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; |
12 |
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; |
13 |
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; |
14 |
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; |
15 |
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; |
16 |
import org.apache.hadoop.util.GenericOptionsParser; |
18 |
import java.io.IOException; |
21 |
* User:http://www.iteblog.com/ |
25 |
public class MulOutput { |
26 |
public static class MapClass |
27 |
extends Mapper<LongWritable, Text, NullWritable, Text> { |
28 |
private MultipleOutputs mos; |
30 |
protected void setup(Context context) |
31 |
throws IOException, InterruptedException { |
33 |
mos = new MultipleOutputs(context); |
37 |
protected void map(LongWritable key, |
40 |
throws IOException, InterruptedException { |
41 |
mos.write(NullWritable.get(), value, |
42 |
generateFileName(value)); |
45 |
private String generateFileName(Text value) { |
46 |
String[] split = value.toString().split( "," , - 1 ); |
47 |
String country = split[ 4 ].substring( 1 , 3 ); |
52 |
protected void cleanup(Context context) |
53 |
throws IOException, InterruptedException { |
54 |
super .cleanup(context); |
59 |
public static void main(String[] args) |
60 |
throws IOException, ClassNotFoundException, |
61 |
InterruptedException { |
62 |
Configuration conf = new Configuration(); |
63 |
Job job = Job.getInstance(conf, "MulOutput" ); |
64 |
String[] remainingArgs = |
65 |
new GenericOptionsParser(conf, args) |
68 |
if (remainingArgs.length != 2 ) { |
69 |
System.err.println( "Error!" ); |
72 |
Path in = new Path(remainingArgs[ 0 ]); |
73 |
Path out = new Path(remainingArgs[ 1 ]); |
75 |
FileInputFormat.setInputPaths(job, in); |
76 |
FileOutputFormat.setOutputPath(job, out); |
78 |
job.setJarByClass(MulOutput. class ); |
79 |
job.setMapperClass(MapClass. class ); |
80 |
job.setInputFormatClass(TextInputFormat. class ); |
81 |
job.setOutputKeyClass(NullWritable. class ); |
82 |
job.setOutputValueClass(Text. class ); |
83 |
job.setNumReduceTasks( 0 ); |
85 |
System.exit(job.waitForCompletion( true ) ? 0 : 1 ); |
上面的程序通过setup(Context context)来初始化MultipleOutputs对象,并在map函数中调用MultipleOutputs的write方法,根据value的值将数据输出到不同的文件夹中(输出路径由generateFileName函数生成)。MultipleOutputs类有多个不同版本的write方法,它们的函数原型如下:
1 |
public <K, V> void write(String namedOutput, K key, V value) |
2 |
throws IOException, InterruptedException |
4 |
public <K, V> void write(String namedOutput, K key, V value, |
5 |
String baseOutputPath) throws IOException, InterruptedException |
7 |
public void write(KEYOUT key, VALUEOUT value, String baseOutputPath) |
8 |
throws IOException, InterruptedException |
1 |
/home/q/hadoop- 2.2 . 0 /bin/hadoop jar \ |
2 |
/export1/tmp/wyp/OutputText.jar com.wyp.MulOutput \ |
3 |
/home/wyp/apat63_99.txt \ |
01 |
[wyp @l -datalogm1.data.cn1 bin]$ /home/q/hadoop- 2.2 . 0 /bin/hadoop fs \ |
03 |
.............................这里省略了很多................................... |
04 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/VN |
05 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 41 /home/wyp/out11/VU |
06 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/YE |
07 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/YU |
08 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/ZA |
09 |
.............................这里省略了很多................................... |
10 |
-rw-r--r-- 3 wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/_SUCCESS |
11 |
-rw-r--r-- 3 wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/part-m- 00000 |
12 |
-rw-r--r-- 3 wyp supergroup 0 2013 - 11 - 26 19 : 42 /home/wyp/out11/part-m- 00001 |
1 |
LazyOutputFormat.setOutputFormatClass(job, |
2 |
TextOutputFormat. class ); |
1 |
job.setOutputFormatClass(TextOutputFormat. class ); |
01 |
[wyp @l -datalogm1.data.cn1 bin]$ /home/q/hadoop- 2.2 . 0 /bin/hadoop fs \ |
03 |
.............................这里省略了很多................................... |
04 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/VU |
05 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/YE |
06 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/YU |
07 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/ZA |
08 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/ZM |
09 |
drwxr-xr-x - wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/ZW |
10 |
.............................这里省略了很多................................... |
11 |
-rw-r--r-- 3 wyp supergroup 0 2013 - 11 - 26 19 : 44 /home/wyp/out12/_SUCCESS |