在MapReduce中利用MultipleOutputs输出多个文件

最近在学习Hadoop,由于用到要将reduce结果输出到多个文档中,所以研究了一下MultipleOutputs用法,在这里总结一下。
首先我用到的例子是将原始数据按国家分类成不同的文档,数据是从网上拷贝下来的,如下:

18.217.167.70   United States
206.96.54.107   United States
196.109.151.139 Mauritius
174.52.58.113   United States
142.111.216.8   Canada
162.100.49.185  United States
146.38.26.54    United States
36.35.107.36    China
95.214.95.13    Spain
2.96.191.111    United Kingdom
62.177.119.177  Czech Republic
21.165.189.3    United States
46.190.32.115   Greece
113.173.113.29  Vietnam
42.65.172.142   Taiwan
197.91.198.199  South Africa
68.165.71.27    United States
110.119.165.104 China
171.50.76.89    India
171.207.52.113  Singapore
40.174.30.170   United States
191.170.95.175  United States
17.81.129.101   United States
91.212.157.202  France
173.83.82.99    United States
129.75.56.220   United States
149.25.104.198  United States
103.110.22.19   Indonesia
204.188.117.122 United States
138.23.10.72    United States
172.50.15.32    United States
85.88.38.58     Belgium
49.15.14.6      India
19.84.175.5     United States
50.158.140.215  United States
161.114.120.34  United States
118.211.174.52  Australia
220.98.113.71   Japan
182.101.16.171  China
25.45.75.194    United Kingdom
168.16.162.99   United States
155.60.219.154  Australia
26.216.17.198   United States
68.34.157.157   United States
89.176.196.28   Czech Republic
173.11.51.134   United States
116.207.191.159 China
164.210.124.152 United States
168.17.158.38   United States
174.24.173.11   United States
143.64.173.176  United States
160.164.158.125 Italy
15.111.128.4    United States
22.71.176.163   United States
105.57.100.182  Morocco
111.147.83.42   China
137.157.65.89   Australia

原数据格式是“IP\tCountry”,现在要做的就是统计每个国家的IP数,然后每个国家以单独的文档存放,文档名称是国家名,内容是国家名及对应的IP总数。
实现的完整代码如下:

import java.io.IOException;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;

/*数据格式为:IP地址 国家(eg:111.147.83.42 China),将其按国家分类输出*/
public class TestMultipleOutput1 {

    public static class IPCountryMapper extends Mapper<Object, Text, Text, IntWritable>{
        private final static int country_pos = 1;
        private final static Pattern pattern = Pattern.compile("\\t"); 
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException{
            String country = pattern.split(value.toString())[country_pos]; 
            context.write(new Text(country), new IntWritable(1));

            System.out.println("key=" + new Text(country) + ";" + "value=" + new IntWritable(1));//测试输出看是否得到相应的key和value
        }
    }

    public static class IPCountryReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
        private MultipleOutputs output;
        protected void setup(Context context){
            output = new MultipleOutputs(context);
        }

        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException{
            int total = 0;
            for(IntWritable value:values){
                total += value.get();
            }
            //格式:public void write(KEYOUT key, VALUEOUT value, String baseOutputPath)
            //output.write(new Text("Output by MultipleOutputs"), NullWritable.get(), key.toString());
            output.write(key, new IntWritable(total), key.toString());
        }

        protected void cleanup(Context context) throws IOException, InterruptedException{
            output.close();
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        if(args.length != 2){
            System.out.println("Usage:<in><out>");
            System.exit(2);
        }

        /*Hadoop自己的安全性检查,如果输出文件存在,程序运行就会失败,所以在这里我们先做判断,如果输出文件存在则删除它*/
        FileSystem hdfs = FileSystem.get(conf);
        Path path = new Path(args[1]);
        if(hdfs.exists(path))
            hdfs.delete(path, true);

        Job job = new Job(conf, "IP count by country to named files");
        job.setJarByClass(TestMultipleOutput1.class);
        //设置输入格式
        job.setInputFormatClass(TextInputFormat.class);

        job.setMapperClass(IPCountryMapper.class);
        job.setMapOutputKeyClass(Text.class);//(1)
        job.setMapOutputValueClass(IntWritable.class);//(2)

        job.setReducerClass(IPCountryReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        //设置输入、输出路径
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true)?0:1);
    }
}

这里主要是在重写reduce时设置了多文件输出;再一个Map的输出格式一定要和Reduce的输入格式相对应,就是如果二者不是默认的数据类型,一定要加上注释中的(1)(2),不然一直会提示数据不一致的错误。

你可能感兴趣的:(hadoop,多文档输出)