Solving data skew problems that arise in real-world MapReduce applications

1.txt:

hello tom1
hello tom2
hello tom3
hello tom4
hello tom5
hello tom6
hello tom7
hello tom8
hello tom9
hello tom10

2.txt:

hello tom11
hello tom12
hello tom13
hello tom14
hello tom15
hello tom16
hello tom17
hello tom18
hello tom19
hello tom20

3.txt:

hello tom21
hello tom22
hello tom23
hello tom24
hello tom25
hello tom26
hello tom27
hello tom28
hello tom29
hello tom30

First, here is a MapReduce program that produces data skew:

 

Map side:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created  on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split(" ");

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();

        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}


Reduce side:

 
  

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created  on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}

App (driver) side:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        conf.set("fs.defaultFS","file:///");
        Job job = Job.getInstance(conf);


        // set basic job properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // set the input and output paths
        FileInputFormat.addInputPath(job,new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job,new  Path("g:/comp/out"));

        // set the combiner class
        job.setCombinerClass(WCSkewReducer.class);

        // set the mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);


        // number of reduce tasks
        job.setNumReduceTasks(4);

        // set the map/reduce output key-value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);

    }
}

Running the code above produces output files under g:/comp/out/.

All 30 "hello" records end up in a single reduce task; the solution is given below.
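
The skew comes from the default partitioner: Hadoop assigns each record to a reducer based only on the hash of its key, so every record with the same key (here, "hello") lands on the same reduce task. The default org.apache.hadoop.mapreduce.lib.partition.HashPartitioner is essentially:

import org.apache.hadoop.mapreduce.Partitioner;

// Essentially what the default HashPartitioner does: identical keys always hash to
// the same partition, so the 30 "hello" records all go to one of the four reducers.
public class HashPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}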


Solution: use a random partitioner.

Map side:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String[] arr = value.toString().split(" ");

        Text keyOut = new Text();
        IntWritable valueOut = new IntWritable();

        for (String s : arr) {
            keyOut.set(s);
            valueOut.set(1);
            context.write(keyOut, valueOut);
        }
    }
}

Reduce side:


import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created  on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}


RandomPartitioner:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Created  on 2017/3/18.
 */
public class RandomPartitioner extends Partitioner<Text, IntWritable> {

    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        // send each record to a uniformly random reducer, ignoring the key
        return new Random().nextInt(numPartitions);
    }
}
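
A small design note, not from the original post: getPartition above allocates a new Random for every record. A sketch of an equivalent variant that reuses one Random per partitioner instance:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.Random;

/**
 * Hypothetical variant, same behavior as the RandomPartitioner above.
 */
public class ReusingRandomPartitioner extends Partitioner<Text, IntWritable> {

    private final Random random = new Random();

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // every record goes to a uniformly random reducer, regardless of its key
        return random.nextInt(numPartitions);
    }
}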


App (driver) side:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Created  on 2017/3/16.
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        conf.set("fs.defaultFS","file:///");
        Job job = Job.getInstance(conf);


        // set basic job properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        job.setInputFormatClass(TextInputFormat.class);

        // set the input and output paths
        FileInputFormat.addInputPath(job,new Path("g:/comp/skew"));
        FileOutputFormat.setOutputPath(job,new  Path("g:/comp/out"));

        // set the partitioner and combiner classes
        job.setPartitionerClass(RandomPartitioner.class);
        job.setCombinerClass(WCSkewReducer.class);

        // set the mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);


        // number of reduce tasks
        job.setNumReduceTasks(4);

        // set the map/reduce output key-value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.waitForCompletion(true);

    }
}


Random partitioning spreads the "hello" records across different reduce tasks, but we are not done yet: the partial counts for "hello" are now scattered across the different part-r files, while the result we actually want is the total number of occurrences of each word. A second MapReduce job is therefore needed to merge those partial counts; its code follows the illustration below.
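
Purely as an illustration (the split is random, so the actual numbers will differ from run to run), the first job's four output files might divide the 30 "hello" counts like this, with each word and its partial count on its own tab-separated line:

part-r-00000: hello 9, tom3 1, ...
part-r-00001: hello 6, tom1 1, ...
part-r-00002: hello 8, tom5 1, ...
part-r-00003: hello 7, tom8 1, ...

The second job reads each line back as (word, partial count) and sums the counts per word, restoring hello = 30 (9 + 6 + 8 + 7).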


Map side:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created  on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // each input line is "word<TAB>partialCount", produced by the first job
        String[] arr = value.toString().split("\t");
        context.write(new Text(arr[0]), new IntWritable(Integer.parseInt(arr[1])));
    }
}

Reduce side:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}


App (driver) side:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Solves the data skew problem: second job that merges the partial counts
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // load the configuration
        Configuration conf = new Configuration();
        // use the local file system
        conf.set("fs.defaultFS", "file:///");
        // create the job
        Job job = Job.getInstance(conf);


        // set job properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        // set the input format
        job.setInputFormatClass(TextInputFormat.class);

        // set the mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // set the map/reduce output key-value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // set the input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // number of reduce tasks
        job.setNumReduceTasks(4);


        job.waitForCompletion(true);


    }

}
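
Instead of listing the four part-r files one by one, the first job's whole output directory can be added as the input path; by default FileInputFormat reads every file in the directory and skips files whose names start with "_" or "." (such as the _SUCCESS marker):

FileInputFormat.addInputPath(job, new Path("g:/comp/out"));
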
After this job runs, the count obtained for "hello" is its grand total, which is exactly what we want; at this point the data skew problem is solved.
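
For the sample input above, the merged output of the second job (spread across its four part-r files according to each word's hash) should contain one tab-separated line per word with its total count:

hello	30
tom1	1
tom2	1
...
tom30	1

(one line for each of tom1 through tom30).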






In the second job above, the driver's input format can also be set to KeyValueTextInputFormat instead of TextInputFormat. Note that in this case the mapper's input key and input value are both of type Text.

The code is as follows:

Map side:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Created on 2017/3/18.
 */
public class WCSkewMapper extends Mapper<Text, Text, Text, IntWritable> {

    @Override
    protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
        // KeyValueTextInputFormat already splits each line into key (word) and value (partial count)
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}

Reduce side:


import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Created  on 2017/3/16.
 */
public class WCSkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        for (IntWritable iw : values) {
            count = count + iw.get();
        }
        context.write(key, new IntWritable(count));
    }
}

App (driver) side:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


/**
 * Solves the data skew problem: second job using KeyValueTextInputFormat
 */
public class WCSkewApp {
    public static void main(String[] args) throws Exception {
        // load the configuration
        Configuration conf = new Configuration();
        // use the local file system
        conf.set("fs.defaultFS", "file:///");
        // create the job
        Job job = Job.getInstance(conf);


        // set job properties
        job.setJobName("WCSkewApp");
        job.setJarByClass(WCSkewApp.class);
        // set the input format
        job.setInputFormatClass(KeyValueTextInputFormat.class);

        // set the mapper and reducer classes
        job.setMapperClass(WCSkewMapper.class);
        job.setReducerClass(WCSkewReducer.class);

        // set the map/reduce output key-value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // set the input and output paths
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00000"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00001"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00002"));
        FileInputFormat.addInputPath(job, new Path("g:/comp/out/part-r-00003"));
        FileOutputFormat.setOutputPath(job, new Path("g:/comp/out8"));

        // number of reduce tasks
        job.setNumReduceTasks(4);


        job.waitForCompletion(true);


    }

}
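
KeyValueTextInputFormat splits each input line at the first separator character (a tab by default) into a Text key and a Text value, which matches the "word<TAB>count" layout that the first job wrote, so the mapper no longer needs to split the line itself. If a different separator is ever required, it can be configured on the job; the property name below is an assumption based on Hadoop 2.x:

// assumption: Hadoop 2.x property for the key/value separator used by KeyValueTextInputFormat
conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");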


