To let MapReduce read from and write to relational databases (MySQL, Oracle) directly, Hadoop provides two classes: DBInputFormat and DBOutputFormat.
DBInputFormat reads rows from a database table and feeds them to the map tasks as input records; DBOutputFormat writes the result set produced by MapReduce back into a database table.
Taking PV (page views) and UV (unique visitors) as the case study, and using IntelliJ IDEA and MySQL as the development tools, we read the source data from a MySQL table and write the computed results back to another MySQL table.
DBInputFormat
Role: fetches records from a database table and supplies them as the input of the map tasks.
DBOutputFormat
Role: a database-backed output format that saves the reducer's results to a database table; each time the reduce method calls context.write(), one row is produced in the table.
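Concretely, DBOutputFormat derives a parameterized INSERT statement from the table and field names registered on the job (see the driver in step 7), then fills the placeholders by calling the output key's write(PreparedStatement) once per reduce output. A minimal sketch of what gets generated, using the public constructQuery helper of the Hadoop mapreduce API (LogOutWritable is the output key class defined in step 4 below):

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;

public class ShowInsertSql {
    public static void main(String[] args) {
        // Same table and field names as DBOutputFormat.setOutput(...) in step 7.
        String sql = new DBOutputFormat<LogOutWritable, NullWritable>()
                .constructQuery("result_pv", new String[]{"date", "times"});
        System.out.println(sql); // roughly: INSERT INTO result_pv (date,times) VALUES (?,?);
    }
}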
3. Implement a custom DBWritable that reads and writes records of the database table
package com.baizhi.pv_mysql;

import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;

/**
 * One row of the input table "logs" (see DBInputFormat.setInput in step 7).
 * DBInputFormat calls readFields(ResultSet) once per row to populate the object.
 */
public class LogWritable implements DBWritable {

    private String ip;
    private Date createTime;
    private String method;
    private String pageaddr;
    private String flow;
    private String status;

    public LogWritable() {
    }

    public LogWritable(String ip, Date createTime, String method, String pageaddr, String flow, String status) {
        this.ip = ip;
        this.createTime = createTime;
        this.method = method;
        this.pageaddr = pageaddr;
        this.flow = flow;
        this.status = status;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public Date getCreateTime() {
        return createTime;
    }

    public void setCreateTime(Date createTime) {
        this.createTime = createTime;
    }

    public String getMethod() {
        return method;
    }

    public void setMethod(String method) {
        this.method = method;
    }

    public String getPageaddr() {
        return pageaddr;
    }

    public void setPageaddr(String pageaddr) {
        this.pageaddr = pageaddr;
    }

    public String getFlow() {
        return flow;
    }

    public void setFlow(String flow) {
        this.flow = flow;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    /**
     * Binds the fields to an INSERT statement. Only exercised when this class is
     * used as an output key; placeholder indices are 1-based and must follow the
     * field order passed to DBOutputFormat.setOutput.
     */
    @Override
    public void write(PreparedStatement preparedStatement) throws SQLException {
        preparedStatement.setString(1, this.ip);
        // PreparedStatement#setDate expects java.sql.Date, not java.util.Date.
        preparedStatement.setDate(2, new java.sql.Date(this.createTime.getTime()));
        preparedStatement.setString(3, this.method);
        preparedStatement.setString(4, this.pageaddr);
        preparedStatement.setString(5, this.flow);
        preparedStatement.setString(6, this.status);
    }

    /** Reads one input row; the column names must match the logs table schema. */
    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        this.ip = resultSet.getString("ip");
        this.createTime = resultSet.getDate("createTime");
        this.method = resultSet.getString("method");
        this.pageaddr = resultSet.getString("pageaddr");
        this.flow = resultSet.getString("flow");
        this.status = resultSet.getString("status");
    }
}
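For reference, an input table matching the readFields() column names above could be created as follows. This DDL is an assumption inferred from the field names (the original article does not show the schema); adjust the column types to your data:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

public class CreateLogsTable {
    public static void main(String[] args) throws Exception {
        // Assumed schema for the "logs" input table; connection settings match step 7.
        String ddl = "CREATE TABLE IF NOT EXISTS logs ("
                + "ip VARCHAR(64), createTime DATE, method VARCHAR(16), "
                + "pageaddr VARCHAR(255), flow VARCHAR(32), status VARCHAR(16))";
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/test", "root", "root");
             Statement stmt = conn.createStatement()) {
            stmt.execute(ddl);
        }
    }
}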
4. Implement a custom DBWritable that writes the processed results to the database table
package com.baizhi.pv_mysql;

import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Date;

/**
 * One row of the result table "result_pv": a day and its page-view count.
 * DBOutputFormat calls write(PreparedStatement) once per reduce output.
 */
public class LogOutWritable implements DBWritable {

    private Date date;
    private Integer times;

    public LogOutWritable() {
    }

    public LogOutWritable(Date date, Integer times) {
        this.date = date;
        this.times = times;
    }

    public Date getDate() {
        return date;
    }

    public void setDate(Date date) {
        this.date = date;
    }

    public Integer getTimes() {
        return times;
    }

    public void setTimes(Integer times) {
        this.times = times;
    }

    /** The 1-based indices follow the field order ("date", "times") given to DBOutputFormat.setOutput. */
    @Override
    public void write(PreparedStatement preparedStatement) throws SQLException {
        preparedStatement.setDate(1, new java.sql.Date(this.date.getTime()));
        preparedStatement.setInt(2, this.times);
    }

    /** Only needed if the result table is ever read back as job input. */
    @Override
    public void readFields(ResultSet resultSet) throws SQLException {
        this.date = resultSet.getDate("date");
        this.times = resultSet.getInt("times");
    }
}
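At write time, DBOutputFormat hands its prepared INSERT to write() once per reduce output. The standalone sketch below imitates that hand-off; it assumes a result_pv table with columns date DATE and times INT created ahead of time, and reuses the connection settings from step 7:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.util.Date;

public class WriteDemo {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/test", "root", "root");
             // The same parameterized INSERT that DBOutputFormat generates.
             PreparedStatement ps = conn.prepareStatement(
                     "INSERT INTO result_pv (date, times) VALUES (?, ?)")) {
            // One reduce output -> one bound row; this is what the record writer does.
            new LogOutWritable(new Date(), 42).write(ps);
            ps.executeUpdate();
        }
    }
}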
5. Implement the map task
package com.baizhi.pv_mysql;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/** Emits (day, 1) for every log row; the input value is populated by DBInputFormat. */
public class MyMapper extends Mapper<LongWritable, LogWritable, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, LogWritable value, Context context) throws IOException, InterruptedException {
        // createTime comes from resultSet.getDate(), i.e. a java.sql.Date,
        // whose toString() renders yyyy-MM-dd, so hits group by day.
        String day = value.getCreateTime().toString();
        context.write(new Text(day), new IntWritable(1));
    }
}
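The grouping works because resultSet.getDate() returns a java.sql.Date, whose toString() is specified to render yyyy-MM-dd; every hit on the same day therefore shares one map output key. A quick check (the DayKeyCheck class name is illustrative):

public class DayKeyCheck {
    public static void main(String[] args) {
        // java.sql.Date#toString() always formats as yyyy-MM-dd.
        java.sql.Date day = java.sql.Date.valueOf("2023-05-01");
        System.out.println(day); // prints 2023-05-01 -> one reduce group per day
    }
}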
6. Implement the reduce task that does the counting
package com.baizhi.pv_mysql;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

/** Sums the per-day counts and writes one result_pv row per day. */
public class MyReducer extends Reducer<Text, IntWritable, LogOutWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        // Parse the yyyy-MM-dd key back into a Date for the output row.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        Date date;
        try {
            date = sdf.parse(key.toString());
        } catch (ParseException e) {
            throw new IOException("unparseable day key: " + key, e);
        }
        // DBOutputFormat ignores the value slot; pass NullWritable.get(), not null.
        context.write(new LogOutWritable(date, sum), NullWritable.get());
    }
}
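The job above computes PV, i.e. one count per log row. For the UV half of the case study, which the article names but does not implement, a variant along the same lines could key by day and collect distinct IPs; the class names below are illustrative, and writing to a separate table (e.g. result_uv via DBOutputFormat.setOutput) would keep the two metrics apart:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;

// Hypothetical UV mapper (its own source file): emit (day, visitor IP).
public class UvMapper extends Mapper<LongWritable, LogWritable, Text, Text> {
    @Override
    protected void map(LongWritable key, LogWritable value, Context context)
            throws IOException, InterruptedException {
        context.write(new Text(value.getCreateTime().toString()), new Text(value.getIp()));
    }
}

// Hypothetical UV reducer (its own source file): count distinct IPs per day.
public class UvReducer extends Reducer<Text, Text, LogOutWritable, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        Set<String> ips = new HashSet<>(); // fine while per-day cardinality is modest
        for (Text ip : values) {
            ips.add(ip.toString());
        }
        try {
            Date date = new SimpleDateFormat("yyyy-MM-dd").parse(key.toString());
            context.write(new LogOutWritable(date, ips.size()), NullWritable.get());
        } catch (ParseException e) {
            throw new IOException("unparseable day key: " + key, e);
        }
    }
}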
7. Set up the driver (initialization) class
package com.baizhi.pv_mysql;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;

import java.io.IOException;

public class PvApplication {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // JDBC connection settings; with Connector/J 8.x the driver class is com.mysql.cj.jdbc.Driver.
        Configuration configuration = new Configuration();
        configuration.set(DBConfiguration.DRIVER_CLASS_PROPERTY, "com.mysql.jdbc.Driver");
        configuration.set(DBConfiguration.URL_PROPERTY, "jdbc:mysql://localhost:3306/test");
        configuration.set(DBConfiguration.USERNAME_PROPERTY, "root");
        configuration.set(DBConfiguration.PASSWORD_PROPERTY, "root");

        Job job = Job.getInstance(configuration, "pv");
        job.setJarByClass(PvApplication.class);

        // Read from the logs table, write to the result_pv table.
        job.setInputFormatClass(DBInputFormat.class);
        job.setOutputFormatClass(DBOutputFormat.class);
        DBInputFormat.setInput(job, LogWritable.class, "logs", null, null,
                "ip", "createTime", "method", "pageaddr", "flow", "status");
        DBOutputFormat.setOutput(job, "result_pv", "date", "times");

        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(LogOutWritable.class);
        job.setOutputValueClass(NullWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
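As a side note, the four configuration.set(...) calls can be collapsed into a single call to DBConfiguration.configureDB, a helper provided for exactly this purpose; a drop-in replacement inside main:

// Equivalent to the four set(...) calls above; with Connector/J 8.x
// the driver class would be com.mysql.cj.jdbc.Driver.
DBConfiguration.configureDB(configuration,
        "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost:3306/test",
        "root", "root");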
8. Add the database driver JAR
Local execution: it is enough to add the MySQL driver dependency (the mysql:mysql-connector-java artifact) to the Maven project.
Remote execution: upload the MySQL driver JAR into the /lib directory under the Hadoop installation directory.