Part 1: Per-phone traffic statistics
Source data:
13156578896 123 456
13156578896 123 456
13155555555 333 555
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512 1644
18784965678 123 567
18845678909 67890 345
12345678999 908 888
13344445555 9999 7777
12345678999 908 888
13344445555 9999 7777
13145678901 678 456
From the data above, compute each phone number's upstream traffic, downstream traffic and total traffic.
1. Data analysis:
(1) Input format:
phone number, upstream traffic, downstream traffic
(2) Desired output format:
Each phone number may appear in several records in the source data, and the final output should contain one line per phone number with its total upstream, downstream and overall traffic. We can therefore lean on the MapReduce framework itself: make the phone number the map output key and the traffic record (a custom Phone object) the map output value. After the shuffle, the reduce side receives all of a phone number's records grouped under that one key, so the reducer only has to iterate the values and accumulate the upstream and downstream traffic.
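For example, with the two identical records for 13156578896 in the sample data, one reduce() call conceptually receives the grouped input sketched below (an illustrative fragment only, not framework code; it uses Hadoop's Text, java.util.Arrays/List, and the Phone Writable defined in the next section):
// Illustrative sketch of one reduce group after the shuffle
Text key = new Text("13156578896");
List<Phone> values = Arrays.asList(
new Phone("13156578896", 123, 456),
new Phone("13156578896", 123, 456));
// The reducer sums up_flow and down_flow over these values: 246 / 912 / 1158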
2. Code:
com.phone.Phone:
package com.phone;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Phone implements Writable {
private String phoneNB;
private long up_flow;
private long down_flow;
private long sum_flow;
// Deserialization uses reflection and needs a no-arg constructor, so define one explicitly
public Phone() {
}
// A convenience constructor for initializing the fields
public Phone(String phoneNB, long up_flow, long down_flow) {
this.phoneNB = phoneNB;
this.up_flow = up_flow;
this.down_flow = down_flow;
this.sum_flow = up_flow + down_flow;
}
// Serialize the object's fields to the stream
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNB);
out.writeLong(up_flow);
out.writeLong(down_flow);
out.writeLong(sum_flow);
}
// Deserialize the object's fields from the stream
// Fields must be read in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNB = in.readUTF();
this.up_flow = in.readLong();
this.down_flow = in.readLong();
this.sum_flow = in.readLong();
}
public String getPhoneNB() {
return phoneNB;
}
public void setPhoneNB(String phoneNB) {
this.phoneNB = phoneNB;
}
public long getUp_flow() {
return up_flow;
}
public void setUp_flow(long up_flow) {
this.up_flow = up_flow;
}
public long getDown_flow() {
return down_flow;
}
public void setDown_flow(long down_flow) {
this.down_flow = down_flow;
}
public long getSum_flow() {
return sum_flow;
}
public void setSum_flow(long sum_flow) {
this.sum_flow = sum_flow;
}
@Override
public String toString() {
return "" + up_flow + "\t" + down_flow + "\t" + sum_flow;
}
}
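As a side note, the small round-trip sketch below (RoundTrip is a made-up class name, not part of the job) shows what the write()/readFields() pair does; this is essentially what the framework performs when it ships Phone objects between map and reduce, and it is why the read order must match the write order:
package com.phone;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class RoundTrip {
public static void main(String[] args) throws IOException {
// Serialize a Phone into a byte buffer, then read it back field by field
ByteArrayOutputStream bos = new ByteArrayOutputStream();
new Phone("13726230503", 2481, 24681).write(new DataOutputStream(bos));
Phone copy = new Phone();
copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
System.out.println(copy); // prints 2481, 24681 and 27162 separated by tabs
}
}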
com.phone.PhoneOutPut:
package com.phone;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class PhoneOutPut {
public static class FlowSumMapper extends Mapper<LongWritable, Text, Text, Phone> {
public void map(LongWritable k1, Text v1,
Context context)
throws IOException, InterruptedException {
// One line of input
String line = v1.toString();
// Split the line and pull out the phone number, upstream and downstream traffic
String[] fields = line.split(" ");
String phoneNB = fields[0];
long up_flow = Long.parseLong(fields[1]);
long down_flow = Long.parseLong(fields[2]);
// Wrap the record in a Phone object and emit it, keyed by the phone number
context.write(new Text(phoneNB), new Phone(phoneNB, up_flow, down_flow));
}
}
public static class FlowSumReducer extends Reducer<Text, Phone, Text, Phone> {
// The framework calls reduce() once per group <phone number, {Phone, Phone, Phone, ...}>
// The logic is simply to iterate the values, accumulate up/down flow and emit the totals
public void reduce(Text k2, Iterable<Phone> v2s,
Context context)
throws IOException, InterruptedException {
long up_flow = 0;
long down_flow = 0;
for (Phone v2 : v2s) {
up_flow += v2.getUp_flow();
down_flow += v2.getDown_flow();
}
// When the result is rendered as text, the Phone value is written via its toString()
context.write(k2, new Phone(k2.toString(), up_flow, down_flow));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: Data Sort ");
System.exit(2);
}
Job job = Job.getInstance(conf, "phone1");
job.setJarByClass(PhoneOutPut.class);
// Set the mapper, combiner and reducer classes
job.setMapperClass(FlowSumMapper.class);
job.setCombinerClass(FlowSumReducer.class);
job.setReducerClass(FlowSumReducer.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Phone.class);
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
context.write() plays roughly the role System.out.print() plays in plain Java: it emits the result. When the value being written is a custom object, its toString() method is called as the record is rendered into the output file, so whatever the overridden toString() returns is exactly what lands in the files on HDFS.
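A minimal sketch of that rendering, assuming the default tab separator used by TextOutputFormat (ToStringSketch is just a made-up class name for illustration):
package com.phone;
public class ToStringSketch {
public static void main(String[] args) {
// One reduce output record rendered the way it appears in the result file:
// the key, a tab, then the value's toString()
Phone total = new Phone("13726230503", 2481, 24681);
System.out.println("13726230503" + "\t" + total);
// prints the key, then 2481 / 24681 / 27162 (up, down and total flow)
}
}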
Output:
12345678999 1816 1776 3592
13145678901 678 456 1134
13155555555 333 555 888
13156578896 246 912 1158
13344445555 19998 15554 35552
13726230503 2481 24681 27162
13826544101 264 0 264
13926435656 132 1512 1644
18784965678 123 567 690
18845678909 67890 345 68235
Part 2: Introducing Hadoop custom sorting
From the result above you can see that, by default, Hadoop sorts the output by the mapper's key in ascending order. What if we want to customise the ordering, say descending by total traffic? As anyone familiar with the shuffle knows, the map output is sorted by key during the shuffle, so all we need to do is make the Phone object itself the map output key; the prerequisite is that Phone implements the Comparable interface. In Hadoop, implementing both Writable and Comparable can be written more compactly as implementing the single WritableComparable interface.
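As an analogy (the real ordering happens inside the shuffle, not via Collections.sort), sorting a few of the MyPhone keys defined below with their compareTo() reproduces the descending-by-total order seen in the final output; SortSketch is a made-up helper class, not part of the job:
package com.zort;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class SortSketch {
public static void main(String[] args) {
// Collections.sort() relies on the same compareTo() the shuffle uses,
// so the keys come out in descending order of total traffic
List<MyPhone> keys = new ArrayList<>(Arrays.asList(
new MyPhone("13826544101", 264, 0),        // total 264
new MyPhone("18845678909", 67890, 345),    // total 68235
new MyPhone("13726230503", 2481, 24681))); // total 27162
Collections.sort(keys);
for (MyPhone k : keys) {
System.out.println(k.getPhoneNB() + "\t" + k);
}
// prints 18845678909 first (68235), then 13726230503 (27162), then 13826544101 (264)
}
}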
Source data:
13156578897 123 456
13156578896 123 456
13155555555 333 555
13726230503 2481 24681
13826544101 264 0
13926435656 132 1512 1644
18784965678 123 567
18845678909 67890 345
12345678999 908 888
13344445555 9999 7777
12345678999 908 888
13344445555 9999 7777
13145678901 678 456
com.zort.MyPhone:
package com.zort;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class MyPhone implements WritableComparable<MyPhone> {
private String phoneNB;
private long up_flow;
private long down_flow;
private long sum_flow;
// Deserialization uses reflection and needs a no-arg constructor, so define one explicitly
public MyPhone() {
}
// A convenience constructor for initializing the fields
public MyPhone(String phoneNB, long up_flow, long down_flow) {
this.phoneNB = phoneNB;
this.up_flow = up_flow;
this.down_flow = down_flow;
this.sum_flow = up_flow + down_flow;
}
// Serialize the object's fields to the stream
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(phoneNB);
out.writeLong(up_flow);
out.writeLong(down_flow);
out.writeLong(sum_flow);
}
// Deserialize the object's fields from the stream
// Fields must be read in exactly the same order they were written
@Override
public void readFields(DataInput in) throws IOException {
this.phoneNB = in.readUTF();
this.up_flow = in.readLong();
this.down_flow = in.readLong();
this.sum_flow = in.readLong();
}
public String getPhoneNB() {
return phoneNB;
}
public void setPhoneNB(String phoneNB) {
this.phoneNB = phoneNB;
}
public long getUp_flow() {
return up_flow;
}
public void setUp_flow(long up_flow) {
this.up_flow = up_flow;
}
public long getDown_flow() {
return down_flow;
}
public void setDown_flow(long down_flow) {
this.down_flow = down_flow;
}
public long getSum_flow() {
return sum_flow;
}
public void setSum_flow(long sum_flow) {
this.sum_flow = sum_flow;
}
@Override
public String toString() {
return "" + up_flow + "\t" + down_flow + "\t" + sum_flow;
}
// Implementing Comparable requires overriding compareTo; returning -1 when this
// object's total is larger sorts the keys in descending order of sum_flow.
// Note: this never returns 0, so records with the same total are kept as distinct
// keys rather than merged in one reduce call (hence the duplicate lines in the output).
@Override
public int compareTo(MyPhone o) {
return this.sum_flow > o.sum_flow ? -1 : 1;
}
}
com.zort.PhoneSort:
package com.zort;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class PhoneSort {
public static class SortMapper extends
Mapper<LongWritable, Text, MyPhone, NullWritable> {
@Override
protected void map(
LongWritable k1,
Text v1,
Context context)
throws IOException, InterruptedException {
String line = v1.toString();
String[] fields = line.split(" ");
String phoneNB = fields[0];
long up_flow = Long.parseLong(fields[1]);
long down_flow = Long.parseLong(fields[2]);
context.write(new MyPhone(phoneNB, up_flow, down_flow),
NullWritable.get());
}
}
public static class SortReducer extends
Reducer<MyPhone, NullWritable, Text, MyPhone> {
@Override
protected void reduce(MyPhone k2, Iterable<NullWritable> v2s,
Context context)
throws IOException, InterruptedException {
String phoneNB = k2.getPhoneNB();
context.write(new Text(phoneNB), k2);
}
}
public static void main(String[] args) throws IOException,
ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(PhoneSort.class);
job.setMapperClass(SortMapper.class);
job.setReducerClass(SortReducer.class);
job.setMapOutputKeyClass(MyPhone.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(MyPhone.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
Output:
18845678909 67890 345 68235
13726230503 2481 24681 27162
13344445555 9999 7777 17776
13344445555 9999 7777 17776
12345678999 908 888 1796
12345678999 908 888 1796
13926435656 132 1512 1644
13145678901 678 456 1134
13155555555 333 555 888
18784965678 123 567 690
13156578897 123 456 579
13156578896 123 456 579
13826544101 264 0 264
Part 3: Introducing Hadoop partitioning
If there is a lot of data and we want the final results stored in different files, what can we do? Hadoop provides the Partitioner mechanism for this, and uses HashPartitioner by default. Its source looks like this:
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
HashPartitioner operates on the mapper output. getPartition() has three parameters: key and value are the mapper's output record, and numReduceTasks is the number of reducer tasks configured for the job, which defaults to 1. Any non-negative integer modulo 1 is 0, so with the default setting getPartition(...) always returns 0: every mapper output record goes to the single reducer and ends up in a single output file. It follows that, if we want the results spread across several files, we have to assign the mapper output to several partitions ourselves, which the exam-score example after the sketch below does with a custom Partitioner.
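A quick standalone sketch of that arithmetic (HashPartitionSketch is a made-up name, not part of any job):
// The default HashPartitioner maths for a couple of keys: with numReduceTasks = 1
// every key maps to partition 0; with more reducers the non-negative hash is
// spread across them by the modulo.
public class HashPartitionSketch {
public static void main(String[] args) {
String[] keys = {"13726230503", "18845678909"};
for (String k : keys) {
int oneReducer = (k.hashCode() & Integer.MAX_VALUE) % 1;   // always 0
int fourReducers = (k.hashCode() & Integer.MAX_VALUE) % 4; // somewhere in 0..3
System.out.println(k + " -> " + oneReducer + " / " + fourReducers);
}
}
}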
Source data:
小明 第一次考试 100 90 80
小李 第一次考试 60 61 64
小王 第一次考试 30 40 50
小明 第二次考试 89 90 80
小李 第二次考试 60 71 64
小王 第二次考试 31 40 50
小宋 第一次考试 30 20 40
小张 第一次考试 20 30 55
小明 第三次考试 78 90 80
com.sum.Sum:
package com.sum;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class Sum implements Writable{
private int chiness;
private int math;
private int english;
private int sum;
public Sum(){
}
public Sum(int chiness,int math,int english){
this.chiness=chiness;
this.math=math;
this.english=english;
this.sum=this.chiness+this.math+this.english;
}
public int getChiness(){
return this.chiness;
}
public int getMath(){
return this.math;
}
public int getEnglish(){
return this.english;
}
public int getSum(){
return this.sum;
}
public String toString(){
return "语文:"+this.chiness+"数学:"+this.math+"英语:"+this.english+"总分:"+this.sum;
}
@Override
public void readFields(DataInput in) throws IOException {
this.chiness=in.readInt();
this.math=in.readInt();
this.english=in.readInt();
this.sum=in.readInt();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(chiness);
out.writeInt(math);
out.writeInt(english);
out.writeInt(sum);
}
}
com.sum.position:
package com.sum;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// Custom partitioner: route each student's records to a fixed reduce task
public class position extends Partitioner<Text, Sum> {
private static HashMap<String, Integer> areaMap = new HashMap<>();
static {
areaMap.put("小明", 0);
areaMap.put("小宋", 1);
areaMap.put("小张", 2);
areaMap.put("小王", 3);
}
@Override
public int getPartition(Text key, Sum value, int numPartitions) {
// The map output key is simply the student's name
String id = key.toString().trim();
Integer areaCoder = areaMap.get(id);
// Names not listed in areaMap (e.g. 小李) fall through to partition 4
if (areaCoder == null) {
areaCoder = 4;
}
return areaCoder;
}
}
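A quick sanity check of the routing (an illustrative fragment using org.apache.hadoop.io.Text, not part of the job; the value argument is ignored by this partitioner, so null is passed here):
// Which partition each sample student goes to, with the five reduce tasks set below
position p = new position();
System.out.println(p.getPartition(new Text("小明"), null, 5)); // 0
System.out.println(p.getPartition(new Text("小王"), null, 5)); // 3
System.out.println(p.getPartition(new Text("小李"), null, 5)); // 4 (not in areaMap)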
com.sum.SumPosition:
package com.sum;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class SumPosition{
public static class Map extends Mapper<LongWritable, Text, Text, Sum>{
public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException{
String line = value.toString();
String name = "";
int chiness = 0;
int math = 0;
int english = 0;
// Tokenize one line: name, exam label, Chinese score, math score, English score
StringTokenizer token = new StringTokenizer(line);
while (token.hasMoreElements()) {
name = token.nextToken();
token.nextToken(); // skip the exam label, e.g. 第一次考试
chiness = Integer.parseInt(token.nextToken());
math = Integer.parseInt(token.nextToken());
english = Integer.parseInt(token.nextToken());
}
// Emit the scores keyed by the student's name
context.write(new Text(name), new Sum(chiness, math, english));
}
}
public static class Reduce extends Reducer<Text, Sum, Text, Sum>{
public void reduce(Text key,Iterable<Sum> value,Context context) throws IOException, InterruptedException{
int chiness=0;
int math=0;
int english=0;
// Accumulate each subject across all of the student's exams
for(Sum v:value ){
chiness+=v.getChiness();
math+=v.getMath();
english+=v.getEnglish();
}
context.write(key, new Sum(chiness,math,english));
}
}
public static void main(String args[]) throws IOException, ClassNotFoundException, InterruptedException{
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: Data Sort ");
System.exit(2);
}
Job job = Job.getInstance(conf, "sum");
job.setJarByClass(SumPosition.class);
// Set the mapper, combiner and reducer classes
job.setMapperClass(Map.class);
job.setCombinerClass(Reduce.class);
job.setReducerClass(Reduce.class);
// Set the custom partitioner
job.setPartitionerClass(position.class);
// Set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Sum.class);
job.setNumReduceTasks(5); // one reduce task per partition, so five output files
// Set the input and output paths
FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
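With five reduce tasks, each student's totals land in a separate part-r-0000x file under the output directory; collected together, the output is as follows: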
小明 语文:267数学:270英语:240总分:777
小宋 语文:30数学:20英语:40总分:90
小张 语文:20数学:30英语:55总分:105
小王 语文:61数学:80英语:100总分:241
小李 语文:120数学:132英语:128总分:380