How to implement a total order sort in Hadoop
1. Use a single reduce task
2. Use a custom partitioner
3. Use a Hadoop sampler
The sampler can do random sampling or split sampling.
It draws a sample of the data, computes the range boundary values, and writes them to a partition file. The partition file is a sequence file that contains only keys, no values; the value is null.
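To make the boundary idea concrete, here is a tiny hypothetical sketch in plain Java (not Hadoop's own code): with R reducers there are R-1 boundary keys, every key is routed to the range it falls into, and because each reducer sorts its own range, the output files read in order give a total order.
public class RangePartitionSketch {
    //boundaries must be sorted; with R reducers there are R-1 of them
    static int partitionFor(int key, int[] boundaries) {
        int p = 0;
        while (p < boundaries.length && key >= boundaries[p]) {
            p++;
        }
        return p; //reducer index in [0, boundaries.length]
    }
    public static void main(String[] args) {
        int[] boundaries = {1993, 2026}; //e.g. two keys sampled from years 1960-2060
        System.out.println(partitionFor(1975, boundaries)); //0 -> first output file
        System.out.println(partitionFor(2000, boundaries)); //1 -> second output file
        System.out.println(partitionFor(2050, boundaries)); //2 -> third output file
    }
}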
Part 1 (a single reduce task)
With one reduce task every key goes through a single sort, so the single output file is already globally ordered. Prepare the Mapper, Reducer, and driver (test) classes.
package com.MaxTemp;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
//input line format: "year temperature"
public class WCMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line =value.toString();
String arr[] = line.split(" ");
context.write(new IntWritable(Integer.parseInt(arr[0])),new IntWritable(Integer.parseInt(arr[1])));
}
}
package com.MaxTemp;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int max=Integer.MIN_VALUE;
for (IntWritable iw:values){
max=max>iw.get()? max:iw.get();
}
context.write(key,new IntWritable(max));
}
}
package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCTest {
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
conf.set("fs.defaultFS","file:///");
Job job=Job.getInstance(conf);
job.setJobName("WCTest");
job.setJarByClass(WCTest.class);
job.setInputFormatClass(TextInputFormat.class);
//input is the data file generated by the Demo class below; the output directory must not already exist
args = new String[]{"file:///d:/mr/a.txt","file:///d:/mr/out"};
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReduce.class);
job.setNumReduceTasks(1);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.waitForCompletion(true);
}
}
Prepare the weather data.
package com.MaxTemp;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
public class Demo {
public static void main(String[] args) throws IOException {
File file=new File("d:/mr/a.txt");
if (file.exists()){
System.out.println("file already exists");
}
FileWriter fw=new FileWriter(file);
//one "year temperature" pair per line: years 1960 to 2060, temperatures -30 to 60
for (int x=1960;x<=2060;x++){
for (int y=-30;y<=60;y++){
fw.write(x+" "+y);
fw.write("\n");
}
}
fw.close();
}
}
Run the driver class.
Part 2 (a custom partitioner)
Each reduce task is assigned a contiguous key range, so every output file is sorted internally and the files are ordered relative to one another; read in partition order they give the total order. The Mapper, Reducer, and data generator are the same as in Part 1; only the driver changes: it registers the custom YearPartitioner (a sketch is shown after the driver) and uses three reduce tasks.
package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCTest {
public static void main(String[] args) throws Exception{
Configuration conf=new Configuration();
conf.set("fs.defaultFS","file:///");
Job job=Job.getInstance(conf);
job.setJobName("WCTest");
job.setJarByClass(WCTest.class);
job.setInputFormatClass(TextInputFormat.class);
//input is the same d:/mr/a.txt generated by the Demo class in Part 1; the output directory must not already exist
args = new String[]{"file:///d:/mr/a.txt","file:///d:/mr/out"};
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setPartitionerClass(YearPartitioner.class);
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReduce.class);
job.setNumReduceTasks(3);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
job.waitForCompletion(true);
}
}
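The original post does not show the YearPartitioner source, so the class below is only a minimal sketch. It assumes the generated data (years 1960 to 2060) and the three reduce tasks configured above; the year boundaries 1994 and 2028 are illustrative choices, not taken from the original.
package com.MaxTemp;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
//Hypothetical range partitioner: each reduce task gets a contiguous year range,
//so the three output files, read in order, form a globally sorted result.
public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable year, IntWritable temp, int numPartitions) {
        int y = year.get();
        //boundaries chosen by hand for years 1960-2060 and numPartitions == 3
        if (y < 1994) return 0;
        if (y < 2028) return 1;
        return 2;
    }
}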
Run the driver class; with three reduce tasks there are now three output files, each covering one year range.
Part 3 (total order sort with the Hadoop sampler)
Instead of hard-coding the ranges, InputSampler samples the input keys, computes the boundary keys, and writes them to a partition file that TotalOrderPartitioner reads at run time. Prepare the Mapper, Reducer, and driver classes.
package com.MaxTemp;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/** Pass-through mapper: the input sequence file already holds (year, temperature) pairs. */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
context.write(key, value);
}
}
package com.MaxTemp;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/** Emits the maximum temperature for each year, same logic as WCReduce above. */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int max = Integer.MIN_VALUE;
for (IntWritable iw : values) {
max = Math.max(max, iw.get());
}
context.write(key, new IntWritable(max));
}
}
package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
/**
*
*/
public class MaxTempApp {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS","file:///");
Job job = Job.getInstance(conf);
//configure the job
job.setJobName("MaxTempApp"); //job name
job.setJarByClass(MaxTempApp.class); //class used to locate the jar
job.setInputFormatClass(SequenceFileInputFormat.class); //input format
//input is the sequence file produced by the Sequence class below; the output directory must not already exist
args = new String[]{"file:///d:/mr/seq/1.seq","file:///d:/mr/out"};
//add the input path
FileInputFormat.addInputPath(job,new Path(args[0]));
//set the output path
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.setMapperClass(MaxTempMapper.class); //mapper class
job.setReducerClass(MaxTempReducer.class); //reducer class
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(IntWritable.class);
//number of reduce tasks
job.setNumReduceTasks(3);
//use the total-order partitioner
job.setPartitionerClass(TotalOrderPartitioner.class);
//create a random sampler
//freq: probability that each key is chosen
//numSamples: total number of samples to draw
//maxSplitsSampled: maximum number of splits to sample
InputSampler.Sampler<IntWritable, IntWritable> sampler =
new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 6000, 3);
//write the sampled boundary keys to the partition file
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("D:/mr/par.lst"));
InputSampler.writePartitionFile(job, sampler);
job.waitForCompletion(true);
}
}
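If you want to see what InputSampler actually wrote, the helper below (a hypothetical class, not part of the original post) dumps D:/mr/par.lst: the partition file is a sequence file holding only the sampled boundary keys, with NullWritable as the value type, which is what the introduction meant by "key only, value null".
package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
public class ShowPartitionFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        //read the partition file written by InputSampler.writePartitionFile above
        SequenceFile.Reader reader =
                new SequenceFile.Reader(FileSystem.get(conf), new Path("D:/mr/par.lst"), conf);
        IntWritable key = new IntWritable();
        //with three reduce tasks the file should contain two boundary keys
        while (reader.next(key, NullWritable.get())) {
            System.out.println(key);
        }
        reader.close();
    }
}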
Prepare the sequence file of random (year, temperature) records.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.Test;
import java.io.IOException;
import java.util.Random;
public class Sequence {
@Test
public void save() throws IOException {
Configuration conf=new Configuration();
conf.set("fs.defaultFS","file:///");
FileSystem fs=FileSystem.get(conf);
Path path=new Path("D:/mr/seq/1.seq");
//key = year, value = temperature, both IntWritable
SequenceFile.Writer writer= SequenceFile.createWriter(fs, conf, path, IntWritable.class, IntWritable.class);
for(int x=0;x<6000;x++){
int year =1970+new Random().nextInt(100);
int temp=-37+new Random().nextInt(100);
writer.append(new IntWritable(year),new IntWritable(temp));
}
writer.close();
}
}
Run the Sequence test class first to generate the sequence file, then run MaxTempApp.
Either IDEA or Eclipse works; in IDEA the Hadoop dependencies (or the relevant jars) must be added to the project.