MapReduce Algorithm Patterns: Data Aggregation (Max/Min/Sum)

I jotted down this material whenever I could squeeze in time around meetings.

Data Aggregation

Aggregation is an essential step in statistical analysis; familiar examples include maximum/minimum, mean, and median. This article implements maximum and minimum, and also computes a sum (the total count).

The scenario: for records sharing the same userid, find the earliest and latest occurrence times, and count how often the userid appears.

Sample input:
123 -- 2017-10-22 12:33:78
123 -- 2017-12-21 12:22:30
234 -- 2017-08-22 11:17:20
2546 -- 2008-10-08 21:19:40
Intermediate map output (note that the invalid time 12:33:78 in the input rolls over to 12:34:18; see the note after the mapper code):
123     2017-12-21 12:22:30 2017-12-21 12:22:30 1
123     2017-10-22 12:34:18 2017-10-22 12:34:18 1
234     2017-08-22 11:17:20 2017-08-22 11:17:20 1
2546    2008-10-08 21:19:40 2008-10-08 21:19:40 1
Final reduce output:
123     2017-10-22 12:34:18 2017-12-21 12:22:30 2
234     2017-08-22 11:17:20 2017-08-22 11:17:20 1
2546    2008-10-08 21:19:40 2008-10-08 21:19:40 1

The classes involved:

NumbericalObject: the custom Writable used as the map and reduce value type
NumbericalObjectMapper: builds the key and value for each record (the value is constructed with startTime and endTime set to the same timestamp)
NumbericalObjectReducer: computes the max and min times and sums the counts
NumbericalObjectMain: the driver (main function)

The code for each class:

NumbericalObject:

package Numberical.MaxMinMidObject;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

/*
    Numerical aggregation.
    Simulated product-review log:
    raw data --> userid : comment timestamp
    Goal: for each userid, find the start (earliest) and end (latest) timestamps
    across all of its records, and count the user's comments in that window.
     */
public class NumbericalObject implements Writable{
    /*
    The timestamp fields and the comment count
     */
    private Date startTime = new Date();
    private Date endTime = new Date();
    private Long count;

    private final static SimpleDateFormat simpletime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    public void setStartTime(Date startTime) {
        this.startTime = startTime;
    }

    public void setEndTime(Date endTime) {
        this.endTime = endTime;
    }

    public void setCount(Long count) {
        this.count = count;
    }

    public Date getStartTime() {
        return startTime;
    }

    public Date getEndTime() {
        return endTime;
    }

    public Long getCount() {
        return count;
    }

    //serialize the object
    public void write(DataOutput dataOutput) throws IOException {
        //a Date is serialized as its epoch-millisecond timestamp, which is a long (8 bytes), hence writeLong
        dataOutput.writeLong(startTime.getTime());
        dataOutput.writeLong(endTime.getTime());
        dataOutput.writeLong(count);
    }
    //deserialize the object
    public void readFields(DataInput dataInput) throws IOException {
        startTime = new Date(dataInput.readLong());
        endTime = new Date(dataInput.readLong());
        count = dataInput.readLong();
    }
    //Implementing Writable requires write/readFields so the object can be serialized and
    //deserialized, since it is used as a map/reduce value.
    //toString must also be overridden so the result can be written to the output file as text.
    @Override
    public String toString(){
        String str = simpletime.format(startTime)+"\t"+simpletime.format(endTime)+"\t"+count;
        return str;
    }
}
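
A quick way to sanity-check the Writable implementation is a local serialization round trip through DataOutputStream and DataInputStream, with no cluster involved. A minimal sketch (the class WritableRoundTrip and its sample values are my own illustration, not part of the project):

package Numberical.MaxMinMidObject;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.text.SimpleDateFormat;

public class WritableRoundTrip {
    public static void main(String[] args) throws Exception {
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        NumbericalObject src = new NumbericalObject();
        src.setStartTime(fmt.parse("2017-10-22 12:34:18"));
        src.setEndTime(fmt.parse("2017-12-21 12:22:30"));
        src.setCount(2L);

        //serialize to a byte buffer, much as Hadoop does between map and reduce
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        src.write(new DataOutputStream(bytes));

        //deserialize into a fresh instance
        NumbericalObject dst = new NumbericalObject();
        dst.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        //the two printed lines should be identical
        System.out.println(src);
        System.out.println(dst);
    }
}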

NumbericalObjectMapper:

package Numberical.MaxMinMidObject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URI;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class NumbericalObjectMapper {
    //delete the output directory if it already exists
    /*
    Configuration: the job configuration
    Path: the path to delete
     */
    public static void delete(Configuration conf, Path path) throws Exception{
        FileSystem fileSystem = FileSystem.get(new URI(path.toString()), conf);
        if (fileSystem.exists(path)) {
            fileSystem.delete(path, true);
        }
    }

    public static class NumbericalMap extends Mapper<Object, Text, Text, NumbericalObject> {
        private static final Logger log = LoggerFactory.getLogger("LOG");
        private Text Numbericalkey = new Text();
        private NumbericalObject NumbericalValue = new NumbericalObject();
        //HH (0-23) rather than hh (1-12), so afternoon times are not folded into the morning
        private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            //input line: "userid -- yyyy-MM-dd HH:mm:ss"
            String[] valueInfo = value.toString().split("--");

            Numbericalkey.set(valueInfo[0].trim());
            try {
                //startTime and endTime both start as the record's own timestamp;
                //the reducer narrows them to the per-key min and max
                Date commentTime = dateFormat.parse(valueInfo[1].trim());
                NumbericalValue.setStartTime(commentTime);
                NumbericalValue.setEndTime(commentTime);
            } catch (ParseException e) {
                //skip the record instead of emitting a stale value
                log.warn("unparseable record: {}", value);
                return;
            }
            NumbericalValue.setCount(1L);

            context.write(Numbericalkey, NumbericalValue);
        }
    }
}
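
One detail worth calling out: the sample input contains the time 12:33:78, whose seconds field is out of range. SimpleDateFormat is lenient by default, so instead of failing it rolls the excess over, which is why that record appears as 12:34:18 in the map output. A standalone demonstration (the class LenientParseDemo is my own illustration):

package Numberical.MaxMinMidObject;

import java.text.SimpleDateFormat;

public class LenientParseDemo {
    public static void main(String[] args) throws Exception {
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        //lenient by default: 33 minutes 78 seconds rolls over to 34 minutes 18 seconds
        System.out.println(fmt.format(fmt.parse("2017-10-22 12:33:78")));
        //prints: 2017-10-22 12:34:18
    }
}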

NumbericalObjectReducer:

package Numberical.MaxMinMidObject;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class NumbericalObjectReducer {

    public static class NumbericalCombine extends Reducer<Text, NumbericalObject, Text, NumbericalObject> {
        //aggregate the time fields per key:
        //key + value[startTime, endTime] --> key
        //startTime is narrowed to min(startTime), endTime to max(endTime)
        //the count values are summed

        private NumbericalObject numbericalObject = new NumbericalObject();
        protected void reduce(Text key, Iterable<NumbericalObject> value, Context context) throws IOException, InterruptedException{
            boolean flag = false;
            long sum = 0L;
            for (NumbericalObject val : value) {
                if (!flag) {
                    //initialize the aggregate from the first value
                    numbericalObject.setStartTime(val.getStartTime());
                    numbericalObject.setEndTime(val.getEndTime());
                    flag = true;
                }
                if (val.getStartTime().compareTo(numbericalObject.getStartTime()) < 0) {
                    //min
                    numbericalObject.setStartTime(val.getStartTime());
                }
                if (val.getEndTime().compareTo(numbericalObject.getEndTime()) > 0) {
                    //max
                    numbericalObject.setEndTime(val.getEndTime());
                }
                sum += val.getCount();
            }
            numbericalObject.setCount(sum);
            context.write(key,numbericalObject);
        }
    }
}
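
Because min, max, and sum are all commutative and associative, the same class can safely serve as both combiner and reducer. The reduce body is just a fold over the values; here is the same logic as a standalone sketch that runs without any Hadoop context (the class FoldDemo and its hard-coded sample values are my own illustration):

package Numberical.MaxMinMidObject;

import java.text.SimpleDateFormat;
import java.util.Arrays;

public class FoldDemo {
    public static void main(String[] args) throws Exception {
        SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        //two single-record values for userid 123, as the mapper would emit them
        NumbericalObject a = new NumbericalObject();
        a.setStartTime(fmt.parse("2017-12-21 12:22:30"));
        a.setEndTime(fmt.parse("2017-12-21 12:22:30"));
        a.setCount(1L);
        NumbericalObject b = new NumbericalObject();
        b.setStartTime(fmt.parse("2017-10-22 12:34:18"));
        b.setEndTime(fmt.parse("2017-10-22 12:34:18"));
        b.setCount(1L);

        //fold: min over startTime, max over endTime, sum over count
        NumbericalObject acc = new NumbericalObject();
        long sum = 0L;
        boolean first = true;
        for (NumbericalObject val : Arrays.asList(a, b)) {
            if (first) {
                acc.setStartTime(val.getStartTime());
                acc.setEndTime(val.getEndTime());
                first = false;
            }
            if (val.getStartTime().compareTo(acc.getStartTime()) < 0) acc.setStartTime(val.getStartTime());
            if (val.getEndTime().compareTo(acc.getEndTime()) > 0) acc.setEndTime(val.getEndTime());
            sum += val.getCount();
        }
        acc.setCount(sum);
        System.out.println("123\t" + acc);
        //prints: 123  2017-10-22 12:34:18  2017-12-21 12:22:30  2
    }
}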

NumbericalObjectMain:

package Numberical.MaxMinMidObject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumbericalObjectMain {
    public static void main(String[] args) throws Exception{

        Configuration conf =new Configuration();
        Path path1 =new Path("NumbericalData/numberical");
        Path path2 =new Path("outputNumberical");
        NumbericalObjectMapper.delete(conf,path2);
        Job job=Job.getInstance(conf,"Numberical");

        FileInputFormat.setInputPaths(job,path1);
        FileOutputFormat.setOutputPath(job,path2);

        job.setJarByClass(NumbericalObjectMain.class);

        job.setMapperClass(NumbericalObjectMapper.NumbericalMap.class);
        job.setCombinerClass(NumbericalObjectReducer.NumbericalCombine.class);
        //the same class serves as combiner and reducer; without setReducerClass the default
        //identity reducer would run and the final per-key aggregation would not be guaranteed
        job.setReducerClass(NumbericalObjectReducer.NumbericalCombine.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NumbericalObject.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NumbericalObject.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
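
For completeness, the driver could also be written with Hadoop's standard Tool/ToolRunner idiom, so the input and output paths and any -D options come from the command line instead of being hard-coded. A sketch under that assumption (the class NumbericalDriver is my own illustration, not part of the original project):

package Numberical.MaxMinMidObject;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class NumbericalDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Path in = new Path(args[0]);
        Path out = new Path(args[1]);
        NumbericalObjectMapper.delete(conf, out);

        Job job = Job.getInstance(conf, "Numberical");
        job.setJarByClass(NumbericalDriver.class);
        job.setMapperClass(NumbericalObjectMapper.NumbericalMap.class);
        job.setCombinerClass(NumbericalObjectReducer.NumbericalCombine.class);
        job.setReducerClass(NumbericalObjectReducer.NumbericalCombine.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NumbericalObject.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NumbericalObject.class);
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new NumbericalDriver(), args));
    }
}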

The project layout and the output:


[screenshot: 结果和输出.png (results and output)]
PS: Things have been pretty busy lately and the overtime is exhausting. It's rare to get a quiet moment to write something of my own; I hope to find time to write many more good articles!!!
