Common InputFormats in MapReduce: A Detailed Case Study of DBInputFormat and DBOutputFormat


I. Background

To let MapReduce jobs access relational databases (MySQL, Oracle) directly, Hadoop provides two classes: DBInputFormat and DBOutputFormat.

DBInputFormat reads rows from a database table and supplies them to the map tasks as input; DBOutputFormat writes the result records produced by the MapReduce job back into a database table.

II. Worked Example

Using PV/UV statistics as the case study (the job below computes PV), with IntelliJ IDEA and MySQL as the development tools, we read data from a MySQL table and write the computed result back into another MySQL table.

  • DBInputFormat

    Role: fetches records from a database table and supplies them to the map tasks as input.

  • DBOutputFormat

    Role: a database-backed output format that saves the reducer's results to a database table; each call to write() in the reduce method inserts one record. (The DBWritable contract both formats rely on is sketched right after this list.)
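
Both classes exchange data through Hadoop's DBWritable contract in org.apache.hadoop.mapreduce.lib.db, shown here for orientation: write() binds an object's fields to the placeholders of the generated INSERT statement, and readFields() populates the object from one row of the query result.

    // org.apache.hadoop.mapreduce.lib.db.DBWritable, shape shown for reference.
    public interface DBWritable {
        // Output side: bind this object's fields to the INSERT statement placeholders.
        void write(java.sql.PreparedStatement statement) throws java.sql.SQLException;
        // Input side: read this object's fields from the current row of the result set.
        void readFields(java.sql.ResultSet resultSet) throws java.sql.SQLException;
    }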

  1. Create the input table

(Figure 1: creating the input table in MySQL)
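
The screenshot showed the table being created. Below is a minimal DDL sketch that matches the table name (logs) and column names configured in the driver class later on; the column types here are assumptions, since the original DDL lived only in the screenshot.

    -- Sketch only: column types are assumptions; match them to your actual data.
    CREATE TABLE logs (
        ip         VARCHAR(15),   -- client IP address
        createTime DATE,          -- access date
        method     VARCHAR(10),   -- HTTP method (GET/POST/...)
        pageaddr   VARCHAR(255),  -- page address visited
        flow       VARCHAR(20),   -- traffic volume
        status     VARCHAR(10)    -- HTTP status code
    );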

  2. Insert some sample data of your own

(Figure 2: sample rows inserted into the logs table)
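
Any rows with these columns will do. The actual test data was in the screenshot; these are hypothetical examples:

    -- Hypothetical sample rows; the original data came from the author's screenshot.
    INSERT INTO logs (ip, createTime, method, pageaddr, flow, status) VALUES
        ('192.168.1.10', '2019-08-01', 'GET',  '/index.html', '1024', '200'),
        ('192.168.1.11', '2019-08-01', 'POST', '/login',      '512',  '302'),
        ('192.168.1.10', '2019-08-02', 'GET',  '/cart.html',  '2048', '200');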

  3. Develop a custom DBWritable class that reads and writes records of the input table

    package com.baizhi.pv_mysql;
    
    import org.apache.hadoop.mapreduce.lib.db.DBWritable;
    
    import java.sql.PreparedStatement;
    import java.sql.ResultSet;
    import java.sql.SQLException;
    import java.util.Date;
    
    public class LogWritable implements DBWritable {
        private String ip;
        private Date createTime;
        private String method;
        private String pageaddr;
        private String flow;
        private String status;
    
        public LogWritable() {
        }
    
        public LogWritable(String ip, Date createTime, String method, String pageaddr, String flow, String status) {
            this.ip = ip;
            this.createTime = createTime;
            this.method = method;
            this.pageaddr = pageaddr;
            this.flow = flow;
            this.status = status;
        }
    
        public String getIp() {
            return ip;
        }
    
        public void setIp(String ip) {
            this.ip = ip;
        }
    
        public Date getCreateTime() {
            return createTime;
        }
    
        public void setCreateTime(Date createTime) {
            this.createTime = createTime;
        }
    
        public String getMethod() {
            return method;
        }
    
        public void setMethod(String method) {
            this.method = method;
        }
    
        public String getPageaddr() {
            return pageaddr;
        }
    
        public void setPageaddr(String pageaddr) {
            this.pageaddr = pageaddr;
        }
    
        public String getFlow() {
            return flow;
        }
    
        public void setFlow(String flow) {
            this.flow = flow;
        }
    
        public String getStatus() {
            return status;
        }
    
        public void setStatus(String status) {
            this.status = status;
        }
    
        @Override
        public void write(PreparedStatement preparedStatement) throws SQLException {
            // JDBC parameter indices are 1-based and must match the field order
            // configured for output. write() is only invoked when this class is
            // used as an output type; for pure input it is never called.
            preparedStatement.setString(1, this.ip);
            preparedStatement.setDate(2, new java.sql.Date(this.createTime.getTime()));
            preparedStatement.setString(3, this.method);
            preparedStatement.setString(4, this.pageaddr);
            preparedStatement.setString(5, this.flow);
            preparedStatement.setString(6, this.status);
        }

        @Override
        public void readFields(ResultSet resultSet) throws SQLException {
            // Populate this object from one row of the SELECT issued by DBInputFormat.
            this.ip = resultSet.getString("ip");
            this.createTime = resultSet.getDate("createTime");
            this.method = resultSet.getString("method");
            this.pageaddr = resultSet.getString("pageaddr");
            this.flow = resultSet.getString("flow");
            this.status = resultSet.getString("status");
        }
    }
    
    

4. Develop a custom DBWritable class that writes the computed result to the output table

package com.baizhi.pv_mysql;

import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;

public class LogOutWritable implements DBWritable {
    private Date date;
    private Integer times;

    public LogOutWritable() {
    }

    public LogOutWritable(Date date, Integer times) {
        this.date = date;
        this.times = times;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }

    public Integer getTimes() {
        return times;
    }

    public void setTimes(Integer times) {
        this.times = times;
    }

    @Override
    public void write(PreparedStatement preparedStatement) throws SQLException {
        // Bind fields in the order declared in DBOutputFormat.setOutput(job, "result_pv", "date", "times").
        preparedStatement.setDate(1, new java.sql.Date(this.date.getTime()));
        preparedStatement.setInt(2, this.times);
    }

    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        this.date = resultSet.getDate("date");
        this.times = resultSet.getInt("times");
    }
}

5. Develop the Mapper

package com.baizhi.pv_mysql;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class MyMapper extends Mapper<LongWritable, LogWritable, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, LogWritable value, Context context) throws IOException, InterruptedException {
        // Called once per row of the logs table; emit (access date, 1), word-count style.
        // createTime holds a java.sql.Date at runtime, so toString() yields "yyyy-MM-dd".
        String day = value.getCreateTime().toString();
        context.write(new Text(day), new IntWritable(1));
    }
}

6. Develop the Reducer

package com.baizhi.pv_mysql;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

public class MyReducer extends Reducer<Text, IntWritable,LogOutWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Sum the 1s emitted by the mapper: the PV count for this date.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // The key is the "yyyy-MM-dd" string built in the mapper; parse it back into a Date.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Date date = null;
        try {
            date = sdf.parse(key.toString());
        } catch (ParseException e) {
            e.printStackTrace();
        }
        // Each write() inserts one row into result_pv; use NullWritable.get() rather than null.
        context.write(new LogOutWritable(date, sum), NullWritable.get());
    }
}

7. Write the driver (job initialization) class

package com.baizhi.pv_mysql;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;

import java.io.IOException;

public class PvApplication {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // JDBC connection settings shared by DBInputFormat and DBOutputFormat.
        Configuration configuration = new Configuration();
        configuration.set(DBConfiguration.DRIVER_CLASS_PROPERTY, "com.mysql.jdbc.Driver");
        configuration.set(DBConfiguration.URL_PROPERTY, "jdbc:mysql://localhost:3306/test");
        configuration.set(DBConfiguration.USERNAME_PROPERTY, "root");
        configuration.set(DBConfiguration.PASSWORD_PROPERTY, "root");
        Job job = Job.getInstance(configuration, "pv");
        job.setJarByClass(PvApplication.class);

        job.setInputFormatClass(DBInputFormat.class);
        job.setOutputFormatClass(DBOutputFormat.class);

        // Input: value class, table name, WHERE conditions, ORDER BY, then the columns to read.
        DBInputFormat.setInput(job, LogWritable.class, "logs", null, null, "ip", "createTime", "method", "pageaddr", "flow", "status");
        // Output: table name followed by the columns to insert.
        DBOutputFormat.setOutput(job, "result_pv", "date", "times");

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(LogOutWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.waitForCompletion(true);
    }
}

8. Create the output table
(Figure 3: creating the result_pv output table in MySQL)
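
The screenshot showed the output table's DDL. A sketch matching the columns configured via DBOutputFormat.setOutput; as before, the column types are assumptions:

    -- Sketch only: column types are assumptions.
    CREATE TABLE result_pv (
        date  DATE,  -- the day being counted
        times INT    -- page views (PV) recorded for that day
    );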

9. Add the JDBC driver jar

Local runs: adding the MySQL connector dependency (mysql:mysql-connector-java) to the Maven project is enough.

Cluster runs: upload the MySQL driver jar into the lib directory under the Hadoop installation so the task JVMs can load the driver class (shipping it with the job via the -libjars option also works).

10. Run the program: right-click the driver class --> Run
(Figure 4: output of a successful run)
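
After the job finishes, the per-day PV counts can be checked directly in MySQL with a simple verification query (hypothetical, assuming the result_pv table above):

    -- Verify the job's output.
    SELECT date, times FROM result_pv ORDER BY date;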
