Hadoop Total Order Sort: Principle and Code

Ways to implement a total order sort in Hadoop:
1. Use a single reducer.
2. Use a custom partitioner.
3. Use a Hadoop sampler (random sampling or split sampling):
sample the input data, compute the split points, and write them to a partition file. The partition file is a sequence file that contains only keys; the values are null (NullWritable).
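To see what the sampler actually writes, you can dump the partition file with a SequenceFile.Reader. This is a minimal sketch, assuming the partition file path D:/mr/par.lst used in Part 3 and IntWritable keys; the class name DumpPartitionFile is only for illustration:

package com.MaxTemp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

public class DumpPartitionFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical path: the partition file written in Part 3
        Path path = new Path("D:/mr/par.lst");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        IntWritable key = new IntWritable();
        NullWritable value = NullWritable.get();
        // Each key is a split point; with R reducers the file holds R-1 keys.
        // Keys below the first split point go to reducer 0, and so on.
        while (reader.next(key, value)) {
            System.out.println(key.get());
        }
        reader.close();
    }
}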

Part 1 (a single reducer)
Prepare the mapper, reducer, and driver (test) classes. With only one reduce task, every key goes to the same reducer, so its single output file is globally sorted.

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class WCMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is "year temperature"; emit (year, temperature).
        String line = value.toString();
        String[] arr = line.split(" ");
        context.write(new IntWritable(Integer.parseInt(arr[0])), new IntWritable(Integer.parseInt(arr[1])));
    }
}

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WCReduce extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Emit the maximum temperature seen for this year.
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}

package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCTest {

public static void main(String[] args) throws Exception{

    Configuration conf=new Configuration();
    conf.set("fs.defaultFS","file:///");
    Job job=Job.getInstance(conf);
    job.setJobName("WCTest");
 
    job.setJarByClass(WCTest.class);
    job.setInputFormatClass(TextInputFormat.class);

    args = new String[]{"file:///d:/mr/b.txt","file:///d:/mr/out"};
    FileInputFormat.addInputPath(job,new Path(args[0]));
    FileOutputFormat.setOutputPath(job,new Path(args[1]));

    job.setMapperClass(WCMapper.class);
    job.setReducerClass(WCReduce.class);

    job.setNumReduceTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.waitForCompletion(true);
}

}

Prepare the weather data.

package com.MaxTemp;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class Demo {
    public static void main(String[] args) throws IOException {
        File file = new File("d:/mr/a.txt");
        if (file.exists()) {
            System.out.println("File already exists");
        }
        FileWriter fw = new FileWriter(file);
        // One space-separated record per line: years 1960-2060, temperatures -30 to 60.
        for (int x = 1960; x <= 2060; x++) {
            for (int y = -30; y <= 60; y++) {
                fw.write(x + " " + y);
                fw.write("\n");
            }
        }
        fw.close();
    }
}
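The generated file contains one space-separated record per line, for example:

1960 -30
1960 -29
...
2060 60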

Run the driver class. With a single reducer there is only one output file (part-r-00000), which is therefore sorted across all keys.

Part 2 (a custom partitioner)
The WCMapper, WCReduce, and Demo classes are the same as in Part 1; only the driver changes. It registers a custom YearPartitioner (sketched after the driver below) and uses three reduce tasks, so each reducer receives a contiguous range of years and the concatenated part files are globally sorted.


package com.MaxTemp;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WCTest {

public static void main(String[] args) throws Exception{

    Configuration conf=new Configuration();
    conf.set("fs.defaultFS","file:///");
    Job job=Job.getInstance(conf);
    job.setJobName("WCTest");
 
    job.setJarByClass(WCTest.class);
    job.setInputFormatClass(TextInputFormat.class);

    args = new String[]{"file:///d:/mr/b.txt","file:///d:/mr/out"};
    FileInputFormat.addInputPath(job,new Path(args[0]));
    FileOutputFormat.setOutputPath(job,new Path(args[1]));

    job.setPartitionerClass(YearPartitioner.class);
    job.setMapperClass(WCMapper.class);
    job.setReducerClass(WCReduce.class);

    job.setNumReduceTasks(3);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.waitForCompletion(true);
}

}
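The driver above relies on a YearPartitioner class. A minimal sketch is shown here, assuming the input years span roughly 1960-2060 and three reducers; the split points 1994 and 2028 are illustrative assumptions chosen to divide that range into rough thirds:

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes contiguous year ranges to reducers so that the reducer outputs,
// read in part-file order, form one globally sorted sequence.
public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable key, IntWritable value, int numPartitions) {
        int year = key.get();
        if (year < 1994) {
            return 0;   // lowest range -> reducer 0
        } else if (year < 2028) {
            return 1;   // middle range -> reducer 1
        }
        return 2;       // highest range -> reducer 2
    }
}

With hard-coded split points the load balance between reducers depends entirely on the data distribution, which is exactly the problem the sampler in Part 3 addresses.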


Run the driver class. Each of the three reducers writes its own sorted part file; because the partitioner sends lower year ranges to lower-numbered reducers, reading part-r-00000 through part-r-00002 in order yields a total order.

Part 3 (total order sort with a Hadoop sampler)

Prepare the mapper, reducer, and driver classes. Instead of hard-coding the split points, an InputSampler samples the input keys, computes the split points, and writes them to a partition file; at runtime the TotalOrderPartitioner reads that file to route each key range to the right reducer.

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * MaxTempMapper: the input is a sequence file of (year, temperature) pairs,
 * so the mapper simply passes each pair through.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }
}

package com.MaxTemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: emits the maximum temperature per year.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}

package com.MaxTemp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

/**
 * MaxTempApp: driver for the sampler-based total order sort.
 */
public class MaxTempApp {
    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

        // Job properties
        job.setJobName("MaxTempApp");                             // job name
        job.setJarByClass(MaxTempApp.class);                      // jar search class

        job.setInputFormatClass(SequenceFileInputFormat.class);   // input is a sequence file
        args = new String[]{"file:///d:/mr/seq/1.seq", "file:///d:/mr/out"};
        FileInputFormat.addInputPath(job, new Path(args[0]));     // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));   // output path

        job.setMapperClass(MaxTempMapper.class);                  // mapper class
        job.setReducerClass(MaxTempReducer.class);                // reducer class
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Total-order partitioner: reads the split points from the partition file.
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Random sampler parameters:
        //   freq             - probability that each key is chosen (0.1)
        //   numSamples       - total number of samples to draw (6000)
        //   maxSplitsSampled - maximum number of input splits to sample (3)
        InputSampler.Sampler<IntWritable, IntWritable> sampler =
                new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 6000, 3);

        job.setNumReduceTasks(3);                                  // number of reducers

        // Write the sampled split points into the partition file.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("D:/mr/par.lst"));
        InputSampler.writePartitionFile(job, sampler);

        job.waitForCompletion(true);
    }
}

Prepare the sequence file.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.Test;

import java.io.IOException;
import java.util.Random;

public class Sequence {
    @Test
    public void save() throws IOException {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("D:/mr/seq/1.seq");
        // Write 6000 random (year, temperature) pairs as IntWritable key/value.
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, IntWritable.class, IntWritable.class);
        for (int x = 0; x < 6000; x++) {
            int year = 1970 + new Random().nextInt(100);
            int temp = -37 + new Random().nextInt(100);
            writer.append(new IntWritable(year), new IntWritable(temp));
        }
        writer.close();
    }
}

Run the test class to generate the sequence file.

After that, run the job from IDEA or Eclipse.
In IDEA you need to add the Hadoop dependencies or the corresponding jars to the project.
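
Once the job finishes, you can sanity-check the result with a small helper like the one below. This is a hypothetical sketch (the class name and output paths are assumptions based on the driver above): the three reducer outputs, read in part-file order, should form one globally sorted sequence of years.

import java.io.BufferedReader;
import java.io.FileReader;

public class VerifyTotalOrder {
    public static void main(String[] args) throws Exception {
        // Hypothetical output paths, matching the driver's output directory and three reducers.
        String[] parts = {"d:/mr/out/part-r-00000",
                          "d:/mr/out/part-r-00001",
                          "d:/mr/out/part-r-00002"};
        int prev = Integer.MIN_VALUE;
        for (String p : parts) {
            try (BufferedReader br = new BufferedReader(new FileReader(p))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // TextOutputFormat writes "key<TAB>value"; the key is the year.
                    int year = Integer.parseInt(line.split("\t")[0]);
                    if (year < prev) {
                        System.out.println("Out of order in " + p + ": " + year);
                    }
                    prev = year;
                }
            }
        }
        System.out.println("Check finished.");
    }
}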