// Input: the unsorted output of the first (summary) job; running this second MapReduce job sorts it.
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.com.bigdata.mr.flowcount.FlowBean;

/**
 * Purpose: aggregate traffic logs per user and output them in descending order of total traffic.
 *
 * Note: this requirement cannot be met in a single MapReduce job; it is split into two:
 * the first job only computes the per-user traffic totals, and the second job (this one)
 * re-processes the output of the first. In real business scenarios, most statistics
 * requirements are implemented by chaining several MapReduce jobs together.
 *
 * Global ordering of the final output assumes the default single reduce task.
 *
 * @author
 */
public class FlowCountSort {

    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input line format: 13480253104	180	180	360
            String line = value.toString();
            String[] fields = line.split("\t");
            String phone = fields[0];
            long upAmount = Long.parseLong(fields[1]);
            long dAmount = Long.parseLong(fields[2]);
            FlowBean countBean = new FlowBean(upAmount, dAmount);
            // Emit the flow bean as the key, so the bean's sort rule (compareTo)
            // determines the order in which keys reach the reducer.
            context.write(countBean, new Text(phone));
        }
    }

    /**
     * Reducer class.
     *
     * @author
     */
    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {

        @Override
        protected void reduce(FlowBean bean, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Beans with equal total traffic compare as equal and are grouped together,
            // so iterate over all phones in the group rather than writing only the first.
            // Hadoop reuses the key object, so `bean` reflects the current record
            // while iterating over the values.
            for (Text phone : values) {
                context.write(phone, bean);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowCountSort.class);
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        /*
         * TextInputFormat and TextOutputFormat are Hadoop's default input/output
         * components, so these two lines can be omitted:
         *
         * job.setInputFormatClass(TextInputFormat.class);
         * job.setOutputFormatClass(TextOutputFormat.class);
         */

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
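For reference, a minimal standalone sketch (no cluster needed) of the ordering this job produces: it parses a few lines in the format emitted by the first job and sorts them with FlowBean's compareTo, which is exactly the rule the shuffle applies to the mapper's keys. FlowSortDemo is a hypothetical demo class, and all sample records except the first are made-up values for illustration.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class FlowSortDemo {
    public static void main(String[] args) {
        String[] lines = {
            "13480253104\t180\t180\t360",
            "13502468823\t7335\t110349\t117684",   // made-up sample record
            "13560439658\t2034\t5892\t7926"        // made-up sample record
        };
        List<Object[]> records = new ArrayList<>();
        for (String line : lines) {
            String[] f = line.split("\t");
            records.add(new Object[] { f[0],
                    new FlowBean(Long.parseLong(f[1]), Long.parseLong(f[2])) });
        }
        // Sort by the same rule the shuffle uses: FlowBean.compareTo (descending sumflow).
        Collections.sort(records, (a, b) -> ((FlowBean) a[1]).compareTo((FlowBean) b[1]));
        for (Object[] r : records) {
            System.out.println(r[0] + "\t" + r[1]); // phone \t up \t down \t sum
        }
    }
}

Running this prints 13502468823 first and 13480253104 last, mirroring the descending-by-total order of the MapReduce output.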
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Example of Hadoop's serialization framework.
 *
 * During deserialization, the reflection mechanism invokes the class's no-arg
 * constructor, so if you define a parameterized constructor, remember to
 * declare the no-arg constructor explicitly as well.
 *
 * @author
 */
public class FlowBean implements WritableComparable<FlowBean> {

    private long upflow;
    private long dflow;
    private long sumflow;

    /**
     * Explicit no-arg constructor, required for deserialization.
     */
    public FlowBean() {
    }

    public FlowBean(long upflow, long dflow) {
        this.upflow = upflow;
        this.dflow = dflow;
        this.sumflow = upflow + dflow;
    }

    public long getUpflow() { return upflow; }
    public void setUpflow(long upflow) { this.upflow = upflow; }
    public long getDflow() { return dflow; }
    public void setDflow(long dflow) { this.dflow = dflow; }
    public long getSumflow() { return sumflow; }
    public void setSumflow(long sumflow) { this.sumflow = sumflow; }

    /**
     * Deserialization: fields must be read from the stream in exactly the same
     * order in which they were written during serialization.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        dflow = in.readLong();
        sumflow = in.readLong();
    }

    /**
     * Serialization.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(dflow);
        // sumflow could be left out of the serialized form, since it can be
        // recomputed from upflow + dflow (readFields would then recompute it).
        out.writeLong(sumflow);
    }

    @Override
    public String toString() {
        return upflow + "\t" + dflow + "\t" + sumflow;
    }

    /**
     * Sort by sumflow in descending order. The original `sumflow > o.getSumflow() ? -1 : 1`
     * never returned 0, which violates the compareTo contract (two equal beans would each
     * claim to be greater than the other) and can break sorting; Long.compare with the
     * arguments swapped gives the same descending order while handling ties correctly.
     */
    @Override
    public int compareTo(FlowBean o) {
        return Long.compare(o.sumflow, this.sumflow);
    }
}
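To make the field-order requirement in write/readFields concrete, here is a small round-trip sketch (FlowBeanSerDemo is a hypothetical demo class, not part of the original post). It mimics what Hadoop does during the shuffle: serialize the bean to a byte stream, then rebuild it via the no-arg constructor and readFields. If readFields consumed the longs in a different order than write emitted them, the values would be silently scrambled rather than raising an error.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FlowBeanSerDemo {
    public static void main(String[] args) throws IOException {
        FlowBean original = new FlowBean(180, 180); // sumflow = 360

        // Serialize: fields are written as upflow, dflow, sumflow.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize: the no-arg constructor, as reflection would use it,
        // then readFields consuming the longs in the same order.
        FlowBean copy = new FlowBean();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(original); // 180	180	360
        System.out.println(copy);     // identical output confirms the round trip
    }
}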