Partitioning in MapReduce is controlled by the abstract class Partitioner. The default implementation computes
(key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
the & Integer.MAX_VALUE mask keeps the result non-negative, which a plain key.hashCode() % numReduceTasks would not guarantee.
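For reference, the default partitioner is essentially the following (paraphrased from the Hadoop source, org.apache.hadoop.mapreduce.lib.partition.HashPartitioner):

import org.apache.hadoop.mapreduce.Partitioner;

// Default partitioner: spreads keys across reducers by hash code.
public class HashPartitioner<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Masking with Integer.MAX_VALUE clears the sign bit, so a negative
        // hashCode() can never produce a negative partition number.
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}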
Our requirement here is 3 partitions, split by phone-number prefix:
13* -> partition 0
15* -> partition 1
everything else -> partition 2
How the number of reducers relates to the number of partitions:
reducer count = partition count: the normal 1:1 case, one output file per partition
reducer count > partition count: produces many empty, useless output files (200 200 199)
reducer count < partition count: the job fails at runtime
package com.ccj.pxj.phone.partition;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Access implements Writable {
private String phone;
private long up;
private long down;
private long sum;
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public long getUp() {
return up;
}
public void setUp(long up) {
this.up = up;
}
public long getDown() {
return down;
}
public void setDown(long down) {
this.down = down;
}
public Access(String phone, long up, long down) {
this.phone = phone;
this.up = up;
this.down = down;
this.sum=up+down;
}
public Access() {
}
@Override
public void write(DataOutput out) throws IOException {
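// The field order written here must match the order read back in readFields below.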
out.writeUTF(phone);
out.writeLong(up);
out.writeLong(down);
out.writeLong(sum);
}
@Override
public void readFields(DataInput in) throws IOException {
this.phone= in.readUTF();
this.up= in.readLong();
this.down=in.readLong();
this.sum=in.readLong();
}
@Override
public String toString() {
return
phone + '\t' +
up +
"\t" + down +
"\t" + sum ;
}
}
package com.ccj.pxj.phone.partition;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class SerDriver {
public static void main(String[] args) throws Exception {
String input = "data/phone_data .txt";
String output = "out";
// 1) Get the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
FileUtils.deleteOutput(configuration, output);
// 2) Set the main class this job should run
job.setJarByClass(SerDriver.class);
// 3) Set the Mapper and Reducer
job.setMapperClass(MyMaper.class);
job.setReducerClass(MyReduce.class);
// 4) Set the output types of the Mapper phase
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Access.class);
// Set the custom partitioner
job.setPartitionerClass(PxjPartition.class);
// Set the number of reducers => determines the number of final output files
job.setNumReduceTasks(3);
// 5) Set the output types of the Reducer phase
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Access.class);
// 6) Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7) Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMaper extends Mapper<LongWritable, Text,Text, Access>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] data = value.toString().split("\t");
String phone = data[1];
// upstream traffic
long up = Long.parseLong(data[data.length - 3]);
// downstream traffic
long down = Long.parseLong(data[data.length - 2]);
context.write(new Text(phone),new Access(phone,up,down));
}
}
public static class MyReduce extends Reducer<Text, Access, NullWritable, Access>{
@Override
protected void reduce(Text key, Iterable<Access> values, Context context) throws IOException, InterruptedException {
long ups=0;
long downs=0;
for (Access value : values) {
ups+=value.getUp();
downs+=value.getDown();
}
context.write(NullWritable.get(),new Access(key.toString(),ups,downs));
}
}
}
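Every driver in these notes calls FileUtils.deleteOutput(...), a helper that is never shown. A minimal sketch of what it presumably does, assuming it simply removes an existing output directory so the job can be rerun (a hypothetical reconstruction, not the author's original class):

package com.ccj.pxj.phone.utils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class FileUtils {
    // Delete the output directory if it already exists;
    // otherwise FileOutputFormat fails because the path is already present.
    public static void deleteOutput(Configuration conf, String output) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(output);
        if (fs.exists(path)) {
            fs.delete(path, true); // recursive delete
        }
    }
}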
package com.ccj.pxj.phone.partition;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class PxjPartition extends Partitioner<Text,Access> {
@Override
public int getPartition(Text text, Access access, int numPartition) {
String phone = text.toString();
/* if("13".equals(phone)){
return 0;
}else if("15".equals(phone)){
return 1;
}else {
return 2;
}
}*/
if(phone.startsWith("13")){
return 0;
}else if(phone.startsWith("15")){
return 1;
}else{
return 2;
}
}
}
Notes:
The number of reducers determines the number of final output files.
reducer count = partition count: the normal 1:1 case
reducer count > partition count: produces many empty, useless output files (200 200 199)
reducer count < partition count: the job fails at runtime
Where each piece runs:
map: MapTask
reduce: ReduceTask
combiner: MapTask
The combiner's parent class is Reducer.
It performs a partial summary / local aggregation of each MapTask's output,
and its business logic is exactly the same as the Reducer's. (That is why WCReducer below can double as the combiner: its input and output types are both (Text, IntWritable), matching the map output.)
Counters without a combiner:
Combine input records=0
Combine output records=0
After enabling it:
Combine input records=12
Combine output records=7
package com.ccj.pxj.commine.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class WCMapper extends Mapper<LongWritable,Text,Text,IntWritable> {
IntWritable ONE = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
String[] words = line.split(",");
for (String word : words) {
context.write(new Text(word),ONE);
}
}
}
package com.ccj.pxj.commine.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class WCReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (IntWritable value : values) {
sum+=value.get();
}
context.write(key,new IntWritable(sum));
}
}
package com.ccj.pxj.commine.wc;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WCDriver {
public static void main(String[] args)throws Exception {
String input = "data/1.txt";
String output = "out1";
// 1) Get the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
FileUtils.deleteOutput(configuration, output);
// 2) Set the main class this job should run
job.setJarByClass(WCDriver.class);
// 3) Set the Mapper and Reducer
job.setMapperClass(WCMapper.class);
job.setReducerClass(WCReducer.class);
// 4) Set the output types of the Mapper phase
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
// Set the Combiner
job.setCombinerClass(WCReducer.class);
// 5) Set the output types of the Reducer phase
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// 6) Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7) Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
}
In MapReduce, the built-in data types are not only Writable, they can also be sorted.
A custom WritableComparable has to implement three methods:
write
readFields
compareTo
Full (total) sort: like ORDER BY
Per-partition sort: like SORT BY
Sorting is applied to the map output key, so to sort by total traffic the Traffic object (whose compareTo compares sum) becomes the map output key and the phone number is demoted to the value.
package com.ccj.pxj.sort;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
 * Custom Writable that is also sortable (by total traffic).
 */
public class Traffic implements WritableComparable<Traffic> {
private String phone;
private long up;
private long down;
private long sum;
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public long getUp() {
return up;
}
public void setUp(long up) {
this.up = up;
}
public long getDown() {
return down;
}
public void setDown(long down) {
this.down = down;
}
public Traffic(String phone, long up, long down) {
this.phone = phone;
this.up = up;
this.down = down;
this.sum=up+down;
}
public Traffic() {
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phone);
out.writeLong(up);
out.writeLong(down);
out.writeLong(sum);
}
@Override
public void readFields(DataInput in) throws IOException {
this.phone= in.readUTF();
this.up= in.readLong();
this.down=in.readLong();
this.sum=in.readLong();
}
@Override
public String toString() {
return
phone + '\t' +
up +
"\t" + down +
"\t" + sum ;
}
@Override
public int compareTo(Traffic o) {
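// Sort descending by total traffic. Note this never returns 0, so records with
// equal sums are still treated as distinct keys when the framework groups map output.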
return this.sum>o.sum?-1:1;
}
}
package com.ccj.pxj.sort;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class AllSortDriver {
public static void main(String[] args) throws Exception {
String input = "data/phone_data .txt";
String output = "out";
// 1) Get the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
FileUtils.deleteOutput(configuration, output);
// 2) Set the main class this job should run
job.setJarByClass(AllSortDriver.class);
// 3) Set the Mapper and Reducer
job.setMapperClass(MyMaper.class);
job.setReducerClass(MyReduce.class);
// 4) Set the output types of the Mapper phase
job.setMapOutputKeyClass(Traffic.class);
job.setMapOutputValueClass(Text.class);
// 5) Set the output types of the Reducer phase (the reducer emits Text keys and Traffic values)
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Traffic.class);
// 6) Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7) Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMaper extends Mapper<LongWritable, Text,Traffic, Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] data = value.toString().split("\t");
String phone = data[1];
// upstream traffic
long up = Long.parseLong(data[data.length - 3]);
// downstream traffic
long down = Long.parseLong(data[data.length - 2]);
context.write(new Traffic(phone,up,down),new Text(phone));
}
}
public static class MyReduce extends Reducer< Traffic,Text, Text, Traffic>{
/* @Override
protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
long ups=0;
long downs=0;
for (Traffic value : values) {
ups+=value.getUp();
downs+=value.getDown();
}
context.write(NullWritable.get(),new Traffic(key.toString(),ups,downs));
}*/
@Override
protected void reduce(Traffic key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(new Text(value),key);
}
}
}
}
Sorting within each partition (partition sort)
The Traffic WritableComparable used here is identical to the class shown in the full-sort example above (package com.ccj.pxj.sort), so it is not repeated.
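The driver below also references a PhonePartitioner that is not shown anywhere in these notes. A minimal sketch, assuming it routes Traffic keys by phone prefix exactly the way PxjPartition did (a hypothetical reconstruction, not the author's original class):

package com.ccj.pxj.sort;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
public class PhonePartitioner extends Partitioner<Traffic, Text> {
    @Override
    public int getPartition(Traffic traffic, Text text, int numPartitions) {
        // Assumed rule, mirroring PxjPartition: 13* -> 0, 15* -> 1, everything else -> 2.
        String phone = traffic.getPhone();
        if (phone.startsWith("13")) {
            return 0;
        } else if (phone.startsWith("15")) {
            return 1;
        } else {
            return 2;
        }
    }
}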
package com.ccj.pxj.sort;
import com.ccj.pxj.phone.utils.FileUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PartitionSortDriver {
public static void main(String[] args) throws Exception {
String input = "data/phone_data .txt";
String output = "out";
// 1) Get the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
FileUtils.deleteOutput(configuration, output);
// 2) Set the main class this job should run
job.setJarByClass(PartitionSortDriver.class);
// 3) Set the Mapper and Reducer, plus the custom partitioner and 3 reducers
job.setMapperClass(MyMaper.class);
job.setReducerClass(MyReduce.class);
job.setPartitionerClass(PhonePartitioner.class);
job.setNumReduceTasks(3);
// 4) Set the output types of the Mapper phase
job.setMapOutputKeyClass(Traffic.class);
job.setMapOutputValueClass(Text.class);
// 5) Set the output types of the Reducer phase (the reducer emits Text keys and Traffic values)
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Traffic.class);
// 6) Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7) Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMaper extends Mapper<LongWritable, Text,Traffic, Text>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] data = value.toString().split("\t");
String phone = data[1];
// upstream traffic
long up = Long.parseLong(data[data.length - 3]);
// downstream traffic
long down = Long.parseLong(data[data.length - 2]);
context.write(new Traffic(phone,up,down),new Text(phone));
}
}
public static class MyReduce extends Reducer< Traffic,Text, Text, Traffic>{
/* @Override
protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException {
long ups=0;
long downs=0;
for (Traffic value : values) {
ups+=value.getUp();
downs+=value.getDown();
}
context.write(NullWritable.get(),new Traffic(key.toString(),ups,downs));
}*/
@Override
protected void reduce(Traffic key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
for (Text value : values) {
context.write(new Text(value),key);
}
}
}
}
A very common requirement: write the data out to specific files according to some rule. A custom OutputFormat with its own RecordWriter handles exactly that.
package com.ccj.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
public class PxjRecordWriter extends RecordWriter<Text,NullWritable> {
FSDataOutputStream pxjout;
FSDataOutputStream wfyout;
FileSystem fileSystem;
public PxjRecordWriter(TaskAttemptContext context){
try{
fileSystem=FileSystem.get(context.getConfiguration());
pxjout=fileSystem.create(new Path("out/pxj"));
wfyout=fileSystem.create(new Path("out/wfy"));
}catch (Exception e){
e.printStackTrace();
}
}
@Override
public void write(Text key, NullWritable value) throws IOException, InterruptedException {
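// Route each record by its content: lines containing "pxj" go to out/pxj, everything else to out/wfy.
// No newline is written here; the driver below appends "\r\n" to every key to compensate.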
if(key.toString().contains("pxj")){
pxjout.write(key.toString().getBytes());
}else{
wfyout.write(key.toString().getBytes());
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
IOUtils.closeStream(pxjout);
IOUtils.closeStream(wfyout);
}
}
package com.ccj.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PxjDataOutputFormat extends FileOutputFormat<Text,NullWritable> {
@Override
public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
return new PxjRecordWriter(job);
}
}
package com.ccj.outputformat;
import com.ccj.pxj.phone.utils.FileUtils;
import com.ccj.pxj.sort.PhonePartitioner;
import com.ccj.pxj.sort.Traffic;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class PxjDataDriver {
public static void main(String[] args) throws Exception {
String input = "data/log.txt";
String output = "out";
// 1) Get the Job object
Configuration configuration = new Configuration();
Job job = Job.getInstance(configuration);
FileUtils.deleteOutput(configuration, output);
// 2) Set the main class this job should run
job.setJarByClass(PxjDataDriver.class);
// 3) Set the Mapper and Reducer
job.setMapperClass(MyMaper.class);
job.setReducerClass(MyReduce.class);
//job.setPartitionerClass(PhonePartitioner.class);
// job.setNumReduceTasks(3);
// 4) Set the output types of the Mapper phase
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
// 5) Set the output types of the Reducer phase
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
// Use the custom OutputFormat instead of the default TextOutputFormat
job.setOutputFormatClass(PxjDataOutputFormat.class);
// 6) Set the input and output paths
FileInputFormat.setInputPaths(job, new Path(input));
FileOutputFormat.setOutputPath(job, new Path(output));
// 7) Submit the job
boolean result = job.waitForCompletion(true);
System.exit(result ? 0 : 1);
}
public static class MyMaper extends Mapper<LongWritable, Text,Text, NullWritable>{
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
context.write(value,NullWritable.get());
}
}
public static class MyReduce extends Reducer< Text,NullWritable, Text,NullWritable>{
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
for (NullWritable value : values) {
context.write(new Text(key+"\r\n"),value);
}
}
}
}
RecordWriter
==> When in doubt, crib from the framework source: TextOutputFormat.getRecordWriter essentially just does "return new LineRecordWriter(...)" (see the simplified sketch below).
The context object is a basket: you can put anything into it and take anything back out.
The source code, and the unit tests inside it, are the best learning material we have; that code is of industrial-grade reference quality.
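A simplified sketch of that pattern, paraphrased from org.apache.hadoop.mapreduce.lib.output.TextOutputFormat (compression handling omitted; a reading aid, not the verbatim source):

public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
    @Override
    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        String separator = conf.get("mapreduce.output.textoutputformat.separator", "\t");
        Path file = getDefaultWorkFile(job, "");   // e.g. part-r-00000 in the task's work directory
        FileSystem fs = file.getFileSystem(conf);
        FSDataOutputStream out = fs.create(file, false);
        // LineRecordWriter is a nested class in the real TextOutputFormat;
        // it writes key + separator + value + newline for every record.
        return new LineRecordWriter<>(out, separator);
    }
}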
Author: pxj (潘陈)
Date: 2020-01-18