a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
a b a b a c hu jn jgh op a a a a b c c c c a a a a a a a c c c c c c c c c c c a a a a
package com.doit.hadoop.skew;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Version 1: plain word count with 2 reduce tasks and no key salting.
 *
 * @author hulc
 * @slogan: just do it
 * @date 2020/8/20 22:48
 */
public class DataSkewDriver {
    public static void main(String[] args) {
        // configuration
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        // note: because a Configuration is passed in explicitly, the job is set to run in local mode
        try {
            Job skew = Job.getInstance(conf, "skew");
            // mapper and reducer classes
            skew.setMapperClass(SkewMapper.class);
            skew.setReducerClass(SkewReducer.class);
            // key/value types of the map output
            skew.setMapOutputKeyClass(Text.class);
            skew.setMapOutputValueClass(IntWritable.class);
            // key/value types of the reduce output
            skew.setOutputKeyClass(Text.class);
            skew.setOutputValueClass(IntWritable.class);
            // input and output paths: E:\DOITLearning\8.Hadoop\mrdata\flow\input\dataskew.txt
            FileInputFormat.setInputPaths(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\input\\dataskew.txt"));
            FileOutputFormat.setOutputPath(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\skew_output1"));
            // number of reduce tasks
            skew.setNumReduceTasks(2);
            // submit the job and wait for it to finish
            boolean b = skew.waitForCompletion(true);
            if (b) {
                System.out.println("success");
            } else {
                System.out.println("failed");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

class SkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    IntWritable mapValue = new IntWritable(1);
    Text mapKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // simple word count: emit (word, 1) for every token on the line
        String line = value.toString();
        String[] split = line.split("\\s+");
        for (String s : split) {
            mapKey.set(s);
            context.write(mapKey, mapValue);
        }
    }
}

class SkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable reduceValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // aggregate: count how many times each word occurred
        int count = 0;
        for (IntWritable value : values) {
            count++;
        }
        reduceValue.set(count);
        context.write(key, reduceValue);
    }
}
package com.doit.hadoop.skew;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Version 2: the same word count, but with 5 reduce tasks.
 *
 * @author hulc
 * @slogan: just do it
 * @date 2020/8/20 22:48
 */
public class DataSkewDriver {
    public static void main(String[] args) {
        // configuration
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        // note: because a Configuration is passed in explicitly, the job is set to run in local mode
        try {
            Job skew = Job.getInstance(conf, "skew");
            // mapper and reducer classes
            skew.setMapperClass(SkewMapper.class);
            skew.setReducerClass(SkewReducer.class);
            // key/value types of the map output
            skew.setMapOutputKeyClass(Text.class);
            skew.setMapOutputValueClass(IntWritable.class);
            // key/value types of the reduce output
            skew.setOutputKeyClass(Text.class);
            skew.setOutputValueClass(IntWritable.class);
            // input and output paths: E:\DOITLearning\8.Hadoop\mrdata\flow\input\dataskew.txt
            FileInputFormat.setInputPaths(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\input\\dataskew.txt"));
            FileOutputFormat.setOutputPath(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\skew_output2"));
            // number of reduce tasks
            skew.setNumReduceTasks(5);
            // submit the job and wait for it to finish
            boolean b = skew.waitForCompletion(true);
            if (b) {
                System.out.println("success");
            } else {
                System.out.println("failed");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

class SkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    IntWritable mapValue = new IntWritable(1);
    Text mapKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // simple word count: emit (word, 1) for every token on the line
        String line = value.toString();
        String[] split = line.split("\\s+");
        for (String s : split) {
            mapKey.set(s);
            context.write(mapKey, mapValue);
        }
    }
}

class SkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable reduceValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // aggregate: count how many times each word occurred
        int count = 0;
        for (IntWritable value : values) {
            count++;
        }
        reduceValue.set(count);
        context.write(key, reduceValue);
    }
}
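Running the job again with five reduce tasks does not remove the skew: Hadoop's default HashPartitioner decides a record's partition purely from the hash of its key, so every occurrence of a hot key such as "a" still lands on a single reducer, and that reducer stays overloaded no matter how many reduce tasks are configured. The standalone snippet below only illustrates that partitioning behaviour; the class name PartitionCheck and the hard-coded word list are my own additions, not part of the original code.

package com.doit.hadoop.skew;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class PartitionCheck {
    public static void main(String[] args) {
        HashPartitioner<Text, IntWritable> partitioner = new HashPartitioner<>();
        String[] words = {"a", "b", "c", "hu", "jn", "jgh", "op"};
        int numReduceTasks = 5;
        for (String word : words) {
            // the partition depends only on the key's hash, never on how many records share that key
            int partition = partitioner.getPartition(new Text(word), new IntWritable(1), numReduceTasks);
            System.out.println(word + " -> partition " + partition);
        }
    }
}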
package com.doit.hadoop.skew;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.Random;

/**
 * Version 3: word count with salted (randomized) keys so the hot keys are spread across reducers; 2 reduce tasks.
 *
 * @author hulc
 * @slogan: just do it
 * @date 2020/8/20 22:48
 */
public class DataSkewDriver {
    public static void main(String[] args) {
        // configuration
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "local");
        // note: because a Configuration is passed in explicitly, the job is set to run in local mode
        try {
            Job skew = Job.getInstance(conf, "skew");
            // mapper and reducer classes
            skew.setMapperClass(SkewMapper.class);
            skew.setReducerClass(SkewReducer.class);
            // key/value types of the map output
            skew.setMapOutputKeyClass(Text.class);
            skew.setMapOutputValueClass(IntWritable.class);
            // key/value types of the reduce output
            skew.setOutputKeyClass(Text.class);
            skew.setOutputValueClass(IntWritable.class);
            // input and output paths: E:\DOITLearning\8.Hadoop\mrdata\flow\input\dataskew.txt
            FileInputFormat.setInputPaths(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\input\\dataskew.txt"));
            FileOutputFormat.setOutputPath(skew, new Path("E:\\DOITLearning\\8.Hadoop\\mrdata\\flow\\skew_output3"));
            // number of reduce tasks
            skew.setNumReduceTasks(2);
            // submit the job and wait for it to finish
            boolean b = skew.waitForCompletion(true);
            if (b) {
                System.out.println("success");
            } else {
                System.out.println("failed");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

class SkewMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    IntWritable mapValue = new IntWritable(1);
    Text mapKey = new Text();
    Random random = new Random();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // simple word count, but with a random salt appended to every key
        String line = value.toString();
        String[] split = line.split("\\s+");
        for (String s : split) {
            // when salting a key, the salt range is usually a multiple of the number of reduce tasks;
            // the salted records are then spread across the partitions with roughly equal probability
            mapKey.set(s + random.nextInt(4));
            context.write(mapKey, mapValue);
        }
    }
}

class SkewReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable reduceValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // aggregate: count how many times each (salted) word occurred
        int count = 0;
        for (IntWritable value : values) {
            count++;
        }
        reduceValue.set(count);
        context.write(key, reduceValue);
    }
}
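With salted keys the reducers no longer see the original words: they emit partial counts for keys such as "a0", "a1", ... "a3". To obtain the final word counts, a second, much lighter pass has to strip the salt and sum the partial counts. The classes below are a minimal sketch of such a follow-up job, not part of the original post; they assume the first job's default TextOutputFormat layout (salted key, a tab, then the partial count) and a single-digit salt as produced by nextInt(4).

package com.doit.hadoop.skew;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// Hypothetical second stage: strip the salt and merge the partial counts.
class UnsaltMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    Text outKey = new Text();
    IntWritable outValue = new IntWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // each input line looks like "a3<TAB>1234": salted key, then the partial count
        String[] fields = value.toString().split("\t");
        String saltedKey = fields[0];
        // drop the trailing salt digit appended by SkewMapper (nextInt(4) yields a single digit);
        // this assumes the original words themselves do not end with a digit
        outKey.set(saltedKey.substring(0, saltedKey.length() - 1));
        outValue.set(Integer.parseInt(fields[1]));
        context.write(outKey, outValue);
    }
}

class UnsaltReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    IntWritable total = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // sum the partial counts that belong to the same original word
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}

A driver identical to the ones above can run this pair, with FileInputFormat pointed at the skew_output3 directory; because the salting already balanced the heavy lifting, this second job handles only a handful of small records per key.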
To sum up, data skew comes from the way distributed computing splits intermediate data into partitions: the partitioning logic cannot split every data set evenly, so the result is several partitions in which some hold far more records than others. The downstream compute nodes then carry very different workloads, which is why the data has to be redistributed in some way; in essence, this is simply a form of load balancing.
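Since the redistribution described above is really a load-balancing decision, another option besides salting (not shown in the original post) is to replace the partitioning itself. The sketch below is a hypothetical custom Partitioner; the class name HotKeyPartitioner and the hard-coded hot keys "a" and "c" are assumptions for illustration only.

package com.doit.hadoop.skew;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: known hot keys get dedicated reducers, everything else is hashed over the rest.
public class HotKeyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        String word = key.toString();
        // with fewer than three partitions there is nothing to reserve; fall back to plain hashing
        if (numPartitions < 3) {
            return (word.hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
        if (word.equals("a")) {
            return 0; // dedicated reducer for hot key "a"
        }
        if (word.equals("c")) {
            return 1; // dedicated reducer for hot key "c"
        }
        // all remaining keys share the other partitions
        return 2 + (word.hashCode() & Integer.MAX_VALUE) % (numPartitions - 2);
    }
}

It would be registered with skew.setPartitionerClass(HotKeyPartitioner.class) and needs at least three reduce tasks; unlike salting, it requires knowing the hot keys in advance and needs no second aggregation pass.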