Requirement 1: for every phone number, compute the total upstream packets, total downstream packets, total upstream traffic, and total downstream traffic.
Requirement 2: sort the result of requirement 1 by total upstream traffic in descending order.
Requirement 3: repartition the result of requirement 1.
Whether you write SQL or MapReduce, start by laying out the format of the result:
phone    upPack total    downPack total    upFlow total    downFlow total
Is there grouping or sorting? That decides what the Map output key is.
What should the Map output value be?
Verification: trace the flow Input -> Map -> Shuffle -> Reduce
Map builds a JavaBean per record (key = phone number, value = upPack downPack upFlow downFlow):
key value
110 1 1 10 10
110 2 2 20 20
119 1 1 3 3
120 1 2 3 4
|
group by key
|
all packet and traffic records of the same phone number end up in one iterator
|
key value:JavaBean
110 {1 1 10 10,2 2 20 20}
119 {1 1 3 3}
120 {1 2 3 4}
Reduce sums each iterator and emits one record per phone number:
key value:JavaBean
110 3 3 30 30
119 1 1 3 3
120 1 2 3 4
Result of requirement 1 (phone, upPack total, downPack total, upFlow total, downFlow total):
13480253104 3 3 180 180
13502468823 57 102 7335 110349
13560439658 33 24 2034 5892
13600217502 37 266 2257 203704
13602846565 15 12 1938 2910
13660577991 24 9 6960 690
13719199419 4 0 240 0
13726230503 24 27 2481 24681
13760778710 2 2 120 120
13823070001 6 3 360 180
13826544101 4 0 264 0
13922314466 12 12 3008 3720
13925057413 69 63 11058 48243
13926251106 4 0 240 0
13926435656 2 4 132 1512
15013685858 28 27 3659 3538
15920133257 20 20 3156 2936
15989002119 3 3 1938 180
18211575961 15 12 1527 2106
18320173382 21 18 9531 2412
84138413 20 16 4116 1432
The result of requirement 1 is sorted by the key, i.e., the phone number.
For requirement 2, ask the same design questions:
What does the result look like? The same five columns, reordered by upFlow total descending.
Is there grouping or sorting? Sorting only, so the whole record becomes the Map output key, which therefore must be a WritableComparable.
What should the Map output value be? Nothing extra is needed: NullWritable.
Result of requirement 2 (same columns, sorted by upFlow total descending):
13925057413 69 63 11058 48243
18320173382 21 18 9531 2412
13502468823 57 102 7335 110349
13660577991 24 9 6960 690
84138413 20 16 4116 1432
15013685858 28 27 3659 3538
15920133257 20 20 3156 2936
13922314466 12 12 3008 3720
13726230503 24 27 2481 24681
13600217502 37 266 2257 203704
13560439658 33 24 2034 5892
13602846565 15 12 1938 2910
15989002119 3 3 1938 180
18211575961 15 12 1527 2106
13823070001 6 3 360 180
13826544101 4 0 264 0
13719199419 4 0 240 0
13926251106 4 0 240 0
13480253104 3 3 180 180
13926435656 2 4 132 1512
13760778710 2 2 120 120
package bigdata.hanjiaxiaozhi.cn.mr.flow;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @ClassName FlowBean01
* @Description TODO Custom data type for the Flow1 program,
* encapsulating: upstream packets, downstream packets, upstream traffic, downstream traffic
* @Date 2020/6/1 11:16
* @Create By hanjiaxiaozhi
*/
public class FlowBean01 implements Writable {
//define the fields
private long upPack;
private long downPack;
private long upFlow;
private long downFlow;
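//a public no-argument constructor is required: Hadoop instantiates Writable objects via reflection during deserialization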
public FlowBean01(){
}
//get and set
public long getUpPack() {
return upPack;
}
public void setUpPack(long upPack) {
this.upPack = upPack;
}
public long getDownPack() {
return downPack;
}
public void setDownPack(long downPack) {
this.downPack = downPack;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
//serialization
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(this.upPack);
out.writeLong(this.downPack);
out.writeLong(this.upFlow);
out.writeLong(this.downFlow);
}
//deserialization: fields must be read in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
this.upPack = in.readLong();
this.downPack = in.readLong();
this.upFlow = in.readLong();
this.downFlow = in.readLong();
}
//toString: the output stage calls this method when writing the results to the file
@Override
public String toString() {
return upPack+"\t"+downPack+"\t"+upFlow+"\t"+downFlow;
}
}
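Note the difference between the two beans in this program: FlowBean01 only ever travels as a value, so implementing Writable is enough; FlowBean02 further below is used as a Map output key, and a key must implement WritableComparable so the shuffle can sort it.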
package bigdata.hanjiaxiaozhi.cn.mr.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
* @ClassName MRFlow1
* @Description TODO MapReduce program for requirement 1 of the traffic analysis: compute the upstream and downstream totals for every phone number
* @Date 2020/5/30 10:34
* @Create By hanjiaxiaozhi
*/
public class MRFlow1 extends Configured implements Tool {
/**
* Encapsulates the construction, configuration and submission of the Job
* @param args
* @return
* @throws Exception
*/
@Override
public int run(String[] args) throws Exception {
//todo:1-build a Job
Job job = Job.getInstance(this.getConf(),"flow1");//build the Job object; getConf() from the parent class supplies the configuration
job.setJarByClass(MRFlow1.class);//set the class used to locate this job's jar
//todo:2-configure the Job
//input
// job.setInputFormatClass(TextInputFormat.class);//set the input format class; the default is already TextInputFormat
Path inputPath = new Path("datas/flowCase/data_flow.dat");//input path (hardcoded here; it could also come from args[0])
//the path may be a directory or a single file; a directory uses every file in it as input, but it must not contain subdirectories
TextInputFormat.setInputPaths(job,inputPath);//set the input path for this job
//map
job.setMapperClass(MRMapper.class);//set the Mapper class whose map method will be called
job.setMapOutputKeyClass(Text.class);//set the Mapper output key type
job.setMapOutputValueClass(FlowBean01.class);//set the Mapper output value type
//shuffle
// job.setPartitionerClass(HashPartitioner.class);//custom partitioning
// job.setGroupingComparatorClass(null);//custom grouping
// job.setSortComparatorClass(null);//custom sorting
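// job.setCombinerClass(MRReducer.class);//optional: because the aggregation here is a pure sum, the same Reducer class could also serve as the Combiner to shrink shuffle traffic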
//reduce
job.setReducerClass(MRReducer.class);//set the Reducer class whose reduce method will be called
job.setOutputKeyClass(Text.class);//set the Reducer output key type
job.setOutputValueClass(FlowBean01.class);//set the Reducer output value type
job.setNumReduceTasks(1);//set the number of ReduceTasks; the default is 1
//output: the output directory must not already exist
// job.setOutputFormatClass(TextOutputFormat.class);//set the output format class; the default is TextOutputFormat
Path outputPath = new Path("datas/output/flow/flow1/output");//output path (hardcoded here; it could also come from args)
//to keep the job from failing because the output directory already exists, delete it up front
//build a FileSystem handle
FileSystem hdfs = FileSystem.get(this.getConf());
//if the output directory exists, delete it
if(hdfs.exists(outputPath)){
hdfs.delete(outputPath,true);
}
TextOutputFormat.setOutputPath(job,outputPath);//set the output path for this Job
//todo:3-submit the Job and wait for completion
return job.waitForCompletion(true) ? 0:-1;
}
/**
* Program entry point: invokes the run method
* @param args
*/
public static void main(String[] args) throws Exception {
//build a Configuration object that manages all settings of this program; real jobs define many custom properties
Configuration conf = new Configuration();
//invoke this class's run method through ToolRunner.run
int status = ToolRunner.run(conf, new MRFlow1(), args);
//exit with the job's status code
System.exit(status);
}
/**
* @ClassName MRMapper
* @Description TODO Map class of the MapReduce template
* input KV types: determined by the InputFormat, TextInputFormat by default
* output KV types: determined by what the map method emits as key and value
*/
public static class MRMapper extends Mapper<LongWritable, Text, Text,FlowBean01> {
private Text outputKey = new Text();
private FlowBean01 outputValue = new FlowBean01();
/**
* Typically used for early filtering and transformation of the data.
* Emits new KeyValue pairs:
* Key: the phone number
* Value: this record's upstream packets, downstream packets, upstream traffic and downstream traffic
*/
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split the line into fields
String[] items = value.toString().split("\t");
//validity check
if(items.length >= 11){
//use a counter to record how many legal records there are
context.getCounter("UserGroup","legal record").increment(1L);
//the phone number becomes the key
this.outputKey.set(items[1]);
//upstream packets, downstream packets, upstream traffic, downstream traffic become the value
this.outputValue.setUpPack(Long.parseLong(items[6]));
this.outputValue.setDownPack(Long.parseLong(items[7]));
this.outputValue.setUpFlow(Long.parseLong(items[8]));
this.outputValue.setDownFlow(Long.parseLong(items[9]));
//emit
context.write(this.outputKey,this.outputValue);
}else{
//use a counter to record how many illegal records there are; counter totals show up in the job's log, and each execution of this line increments the count by 1
context.getCounter("UserGroup","illegal record").increment(1L);
//a record with fewer than 11 fields is only counted; return and take the next record
return;
}
}
}
/**
* @ClassName MRReducer
* @Description TODO Reducer class of the MapReduce template
* input KV types: must match the Map output types
* output KV types: determined by what the reduce method emits as key and value
*/
public static class MRReducer extends Reducer<Text,FlowBean01,Text,FlowBean01> {
private FlowBean01 outputValue = new FlowBean01();
/**
* Implements the reduce logic: aggregation after grouping
*/
@Override
protected void reduce(Text key, Iterable<FlowBean01> values, Context context) throws IOException, InterruptedException {
long sumUpPack = 0;
long sumDownPack = 0;
long sumUpFlow = 0;
long sumDownFlow = 0;
//iterate over all of this phone number's records and accumulate the upstream/downstream packet and traffic numbers
for (FlowBean01 value : values) {
sumUpPack += value.getUpPack();
sumDownPack += value.getDownPack();
sumUpFlow += value.getUpFlow();
sumDownFlow += value.getDownFlow();
}
//use the computed totals as the value
this.outputValue.setUpPack(sumUpPack);
this.outputValue.setDownPack(sumDownPack);
this.outputValue.setUpFlow(sumUpFlow);
this.outputValue.setDownFlow(sumDownFlow);
//emit the result
context.write(key,this.outputValue);
}
}
}
package bigdata.hanjiaxiaozhi.cn.mr.flow;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
* @ClassName FlowBean02
* @Description TODO Encapsulates the result of Flow1,
* i.e.: phone number, total upstream packets, total downstream packets, total upstream traffic, total downstream traffic
* @Date 2020/6/1 11:16
* @Create By hanjiaxiaozhi
*/
public class FlowBean02 implements WritableComparable<FlowBean02> {
//define the fields
private String phone;
private long upPack;
private long downPack;
private long upFlow;
private long downFlow;
public FlowBean02(){
}
//get and set
public String getPhone() {
return phone;
}
public void setPhone(String phone) {
this.phone = phone;
}
public long getUpPack() {
return upPack;
}
public void setUpPack(long upPack) {
this.upPack = upPack;
}
public long getDownPack() {
return downPack;
}
public void setDownPack(long downPack) {
this.downPack = downPack;
}
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getDownFlow() {
return downFlow;
}
public void setDownFlow(long downFlow) {
this.downFlow = downFlow;
}
//serialization
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.phone);
out.writeLong(this.upPack);
out.writeLong(this.downPack);
out.writeLong(this.upFlow);
out.writeLong(this.downFlow);
}
//deserialization: fields must be read in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
this.phone = in.readUTF();
this.upPack = in.readLong();
this.downPack = in.readLong();
this.upFlow = in.readLong();
this.downFlow = in.readLong();
}
//toString: the output stage calls this method when writing the results to the file
@Override
public String toString() {
return phone+"\t"+upPack+"\t"+downPack+"\t"+upFlow+"\t"+downFlow;
}
/**
* Controls how this type is sorted during shuffle: descending by total upstream traffic
* @param o
* @return
*/
@Override
public int compareTo(FlowBean02 o) {
//compare on upFlow total only; the arguments are swapped to get descending order
return Long.compare(o.getUpFlow(), this.getUpFlow());
}
}
package bigdata.hanjiaxiaozhi.cn.mr.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import java.io.IOException;
/**
* @ClassName MRFlow2
* @Description TODO Takes the result of MRFlow1 and sorts it by total upstream traffic in descending order
* @Date 2020/5/30 10:34
* @Create By hanjiaxiaozhi
*/
public class MRFlow2 extends Configured implements Tool {
/**
* Encapsulates the construction, configuration and submission of the Job
* @param args
* @return
* @throws Exception
*/
@Override
public int run(String[] args) throws Exception {
//todo:1-build a Job
Job job = Job.getInstance(this.getConf(),"flow2");//build the Job object; getConf() from the parent class supplies the configuration
job.setJarByClass(MRFlow2.class);//set the class used to locate this job's jar
//todo:2-configure the Job
//input
// job.setInputFormatClass(TextInputFormat.class);//set the input format class; the default is already TextInputFormat
Path inputPath = new Path("datas/output/flow/flow1/output/part-r-00000");//input: the result file produced by MRFlow1 (hardcoded here)
//the path may be a directory or a single file; a directory uses every file in it as input, but it must not contain subdirectories
TextInputFormat.setInputPaths(job,inputPath);//set the input path for this job
//map
job.setMapperClass(MRMapper.class);//set the Mapper class whose map method will be called
job.setMapOutputKeyClass(FlowBean02.class);//set the Mapper output key type
job.setMapOutputValueClass(NullWritable.class);//set the Mapper output value type
//shuffle
// job.setPartitionerClass(HashPartitioner.class);//custom partitioning
// job.setGroupingComparatorClass(null);//custom grouping
// job.setSortComparatorClass(null);//custom sorting
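//no custom sort comparator is needed: the shuffle sorts the map output keys with FlowBean02.compareTo, which already orders records by upFlow descending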
//reduce
job.setReducerClass(MRReducer.class);//set the Reducer class whose reduce method will be called
job.setOutputKeyClass(FlowBean02.class);//set the Reducer output key type
job.setOutputValueClass(NullWritable.class);//set the Reducer output value type
job.setNumReduceTasks(1);//set the number of ReduceTasks; the default is 1
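//note: a single ReduceTask produces one globally sorted output file; with several ReduceTasks each file would only be sorted internally, unless a total-order partitioner were added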
//output: the output directory must not already exist
// job.setOutputFormatClass(TextOutputFormat.class);//set the output format class; the default is TextOutputFormat
Path outputPath = new Path("datas/output/flow/flow2/output");//output path (hardcoded here; it could also come from args)
//to keep the job from failing because the output directory already exists, delete it up front
//build a FileSystem handle
FileSystem hdfs = FileSystem.get(this.getConf());
//if the output directory exists, delete it
if(hdfs.exists(outputPath)){
hdfs.delete(outputPath,true);
}
TextOutputFormat.setOutputPath(job,outputPath);//set the output path for this Job
//todo:3-submit the Job and wait for completion
return job.waitForCompletion(true) ? 0:-1;
}
/**
* Program entry point: invokes the run method
* @param args
*/
public static void main(String[] args) throws Exception {
//build a Configuration object that manages all settings of this program; real jobs define many custom properties
Configuration conf = new Configuration();
//invoke this class's run method through ToolRunner.run
int status = ToolRunner.run(conf, new MRFlow2(), args);
//exit with the job's status code
System.exit(status);
}
/**
* @ClassName MRMapper
* @Description TODO Map class of the MapReduce template
* input KV types: determined by the InputFormat, TextInputFormat by default
* output KV types: determined by what the map method emits as key and value
*/
public static class MRMapper extends Mapper<LongWritable, Text, FlowBean02,NullWritable> {
private FlowBean02 outputKey = new FlowBean02();
private NullWritable outputValue = NullWritable.get();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//split the line (one result record of MRFlow1) into its five columns
String[] items = value.toString().split("\t");
//pack all five columns into the key
this.outputKey.setPhone(items[0]);
this.outputKey.setUpPack(Long.parseLong(items[1]));
this.outputKey.setDownPack(Long.parseLong(items[2]));
this.outputKey.setUpFlow(Long.parseLong(items[3]));
this.outputKey.setDownFlow(Long.parseLong(items[4]));
//emit
context.write(this.outputKey,this.outputValue);
}
}
/**
* @ClassName MRReducer
* @Description TODO Reducer class of the MapReduce template
* input KV types: must match the Map output types
* output KV types: determined by what the reduce method emits as key and value
*/
public static class MRReducer extends Reducer<FlowBean02,NullWritable,FlowBean02,NullWritable> {
@Override
protected void reduce(FlowBean02 key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
/**
* Emit every record as-is
*/
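//note: records with equal upFlow compare as identical keys and arrive in a single reduce call;
//writing inside the loop still emits every record, because Hadoop updates the key instance in place as the values iterator advances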
for (NullWritable value : values) {
context.write(key,value);
}
}
}
}
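Requirement 3 (repartition the result of requirement 1) was listed at the top but never implemented above. Below is a minimal sketch of the standard mechanism for it, a custom Partitioner; the class name FlowPartitioner and the three prefix buckets are illustrative assumptions, not part of the original requirement.
package bigdata.hanjiaxiaozhi.cn.mr.flow;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
* @ClassName FlowPartitioner
* @Description A hypothetical Partitioner for requirement 3: route the requirement-1
* records to different reducers by phone-number prefix
*/
public class FlowPartitioner extends Partitioner<Text, FlowBean01> {
@Override
public int getPartition(Text key, FlowBean01 value, int numReduceTasks) {
String phone = key.toString();
//partition 0: numbers starting with 135 or 136
if(phone.startsWith("135") || phone.startsWith("136")){
return 0;
}
//partition 1: numbers starting with 137, 138 or 139
if(phone.startsWith("137") || phone.startsWith("138") || phone.startsWith("139")){
return 1;
}
//partition 2: everything else
return 2;
}
}
To wire it in, the shuffle section of the MRFlow1 driver would set job.setPartitionerClass(FlowPartitioner.class); together with job.setNumReduceTasks(3);, so that each partition lands in its own output file (part-r-00000 through part-r-00002).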