我们这里的mapper程序不做任何逻辑,也不对key,与value做任何改变,只是接收我们的数据,然后往下发送
public class MyMapper extends Mapper<LongWritable,Text,Text,NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(value,NullWritable.get());
}
}
我们的reducer也不做任何处理,将我们的数据原封不动的输出即可
public class MyReducer extends Reducer<Text,NullWritable,Text,NullWritable> {
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key,NullWritable.get());
}
}
/**
* 这里的输入类型与我们map阶段的输出类型相同
*/
public class MyPartitioner extends Partitioner<Text,NullWritable>{
/**
* 返回值表示我们的数据要去到哪个分区
* 返回值只是一个分区的标记,标记所有相同的数据去到指定的分区
*/
@Override
public int getPartition(Text text, NullWritable nullWritable, int i) {
String result = text.toString().split("\t")[5];
System.out.println(result);
if (Integer.parseInt(result) > 15){
return 1;
}else{
return 0;
}
}
}
public class PartitionMain extends Configured implements Tool {
public static void main(String[] args) throws Exception{
int run = ToolRunner.run(new Configuration(), new PartitionMain(), args);
System.exit(run);
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(), PartitionMain.class.getSimpleName());
job.setJarByClass(PartitionMain.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextInputFormat.addInputPath(job,new Path("hdfs://192.168.52.100:8020/partitioner"));
TextOutputFormat.setOutputPath(job,new Path("hdfs://192.168.52.100:8020/outpartition"));
job.setMapperClass(MyMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setReducerClass(MyReducer.class);
/**
* 设置我们的分区类,以及我们的reducetask的个数,注意reduceTask的个数一定要与我们的
* 分区数保持一致
*/
job.setPartitionerClass(MyPartitioner.class);
job.setNumReduceTasks(2);
boolean b = job.waitForCompletion(true);
return b?0:1;
}
}
Writable:hadoop当中序列化的接口
Comparable :javaSE当中的排序的接口
如果我们只需要序列化:实现writable即可
如果既需要序列化也需要排序:实现 WritableComparable
默认排序规则是对k2 进行排序。以后记住:需要对谁进行排序就把谁作为k2
a 1
a 9
b 3
a 7
b 8
b 10
a 5
要求第一列按照字典顺序进行排列,第一列相同的时候,第二列按照升序进行排列
a 1
a 9
b 3
a 7
b 8
b 10
a 5
a 9
排序需求:先对第一列进行排序,如果第一列相等了,再对第二列进行排序
a 1
a 5
a 7
a 9
a 9
b 3
b 8
b 10
二级排序
两个字段都要进行排序,两个字段都要作为我们K2 。可不可以将两个字段封装成一个javaBean,作为我们k2
javaBean需要序列化 要排序 javaBean 实现 WritableComparable 即可
public class PairWritable implements WritableComparable<PairWritable> {
// 组合key,第一部分是我们第一列,第二部分是我们第二列
private String first;
private int second;
public PairWritable() {
}
public PairWritable(String first, int second) {
this.set(first, second);
}
/**
* 方便设置字段
*/
public void set(String first, int second) {
this.first = first;
this.second = second;
}
/**
* 反序列化
*/
@Override
public void readFields(DataInput input) throws IOException {
this.first = input.readUTF();
this.second = input.readInt();
}
/**
* 序列化
*/
@Override
public void write(DataOutput output) throws IOException {
output.writeUTF(first);
output.writeInt(second);
}
/*
* 重写比较器
*/
public int compareTo(PairWritable o) {
//每次比较都是调用该方法的对象与传递的参数进行比较,说白了就是第一行与第二行比较完了之后的结果与第三行比较,
//得出来的结果再去与第四行比较,依次类推
System.out.println(o.toString());
System.out.println(this.toString());
int comp = this.first.compareTo(o.first);
if (comp != 0) {
return comp;
} else { // 若第一个字段相等,则比较第二个字段
return Integer.valueOf(this.second).compareTo(
Integer.valueOf(o.getSecond()));
}
}
public int getSecond() {
return second;
}
public void setSecond(int second) {
this.second = second;
}
public String getFirst() {
return first;
}
public void setFirst(String first) {
this.first = first;
}
@Override
public String toString() {
return "PairWritable{" +
"first='" + first + '\'' +
", second=" + second +
'}';
}
}
public class SortMapper extends Mapper<LongWritable,Text,PairWritable,IntWritable> {
private PairWritable mapOutKey = new PairWritable();
private IntWritable mapOutValue = new IntWritable();
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String lineValue = value.toString();
String[] strs = lineValue.split("\t");
//设置组合key和value ==> <(key,value),value>
mapOutKey.set(strs[0], Integer.valueOf(strs[1]));
mapOutValue.set(Integer.valueOf(strs[1]));
context.write(mapOutKey, mapOutValue);
}
}
public class SortReducer extends Reducer<PairWritable,IntWritable,Text,IntWritable> {
private Text outPutKey = new Text();
@Override
public void reduce(PairWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
//迭代输出
for(IntWritable value : values) {
outPutKey.set(key.getFirst());
context.write(outPutKey, value);
}
}
}
public class SecondarySort extends Configured implements Tool {
@Override
public int run(String[] args) throws Exception {
Configuration conf = super.getConf();
conf.set("mapreduce.framework.name","local");
Job job = Job.getInstance(conf, SecondarySort.class.getSimpleName());
job.setJarByClass(SecondarySort.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job,new Path("file:///L:\\大数据离线阶段备课教案以及资料文档——by老王\\4、大数据离线第四天\\排序\\input"));
TextOutputFormat.setOutputPath(job,new Path("file:///L:\\大数据离线阶段备课教案以及资料文档——by老王\\4、大数据离线第四天\\排序\\output"));
job.setMapperClass(SortMapper.class);
job.setMapOutputKeyClass(PairWritable.class);
job.setMapOutputValueClass(IntWritable.class);
job.setReducerClass(SortReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static void main(String[] args) throws Exception {
Configuration entries = new Configuration();
ToolRunner.run(entries,new SecondarySort(),args);
}
}
计数器主要是mr当中提供给我们的一个计数的工具
mapreduce当中的一个调优的手段
通过context上下文对象可以获取我们的计数器,进行记录,在map端使用计数器进行统计
public class SortMapper extends Mapper<LongWritable,Text,PairWritable,IntWritable> {
private PairWritable mapOutKey = new PairWritable();
private IntWritable mapOutValue = new IntWritable();
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//自定义我们的计数器,这里实现了统计map数据数据的条数
Counter counter = context.getCounter("MR_COUNT", "MapRecordCounter");
counter.increment(1L);
String lineValue = value.toString();
String[] strs = lineValue.split("\t");
//设置组合key和value ==> <(key,value),value>
mapOutKey.set(strs[0], Integer.valueOf(strs[1]));
mapOutValue.set(Integer.valueOf(strs[1]));
context.write(mapOutKey, mapOutValue);
}
}
运行程序之后就可以看到我们自定义的计数器在map阶段读取了七条数据
通过enum枚举类型来定义计数器
统计reduce端数据的输入的key有多少个,对应的value有多少个
ublic class SortReducer extends Reducer<PairWritable,IntWritable,Text,IntWritable> {
private Text outPutKey = new Text();
public static enum Counter{
REDUCE_INPUT_RECORDS, REDUCE_INPUT_VAL_NUMS,
}
@Override
public void reduce(PairWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
context.getCounter(Counter.REDUCE_INPUT_RECORDS).increment(1L);
//迭代输出
for(IntWritable value : values) {
context.getCounter(Counter.REDUCE_INPUT_VAL_NUMS).increment(1L);
outPutKey.set(key.getFirst());
context.write(outPutKey, value);
}
}
}