People often complain that MapReduce results cannot be written directly to a database.
In fact, MapReduce ships with database input/output support.
The code in this case study implements reading from and writing to a database with MapReduce, as well as copying the rows of one table into another table.
create database htu;
use htu;
create table word(
    name varchar(255) comment 'word',
    count int comment 'count'
);
create table new_word(
    name varchar(255) comment 'word',
    count int comment 'count'
);
package com.lihaozhe.db;

import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;

import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * @author 李昊哲
 * @version 1.0.0
 * @create 2023/11/7
 */
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
public class Word implements DBWritable {
    /**
     * the word
     */
    private String name;
    /**
     * number of occurrences
     */
    private int count;

    @Override
    public String toString() {
        return this.name + "\t" + this.count;
    }

    @Override
    public void write(PreparedStatement pst) throws SQLException {
        pst.setString(1, this.name);
        pst.setInt(2, this.count);
    }

    @Override
    public void readFields(ResultSet rs) throws SQLException {
        this.name = rs.getString(1);
        this.count = rs.getInt(2);
    }
}
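Word only implements DBWritable, which is enough here because all three jobs below are map-only and the objects never pass through the shuffle. If the class ever had to act as a key or value in a job with reducers, it would also need Hadoop's Writable serialization (and Comparable for keys). The class below is a hypothetical sketch of such a variant, not something the original code requires:

package com.lihaozhe.db;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Hypothetical variant of Word: only needed if the object must pass through the
 * shuffle (jobs with reducers); the map-only jobs in this article do not need it.
 */
public class WritableWord extends Word implements WritableComparable<WritableWord> {

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(getName());   // serialize the word
        out.writeInt(getCount());  // then its count
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        setName(in.readUTF());     // deserialize in the same field order
        setCount(in.readInt());
    }

    @Override
    public int compareTo(WritableWord other) {
        return getName().compareTo(other.getName()); // sort keys by the word itself
    }
}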
package com.lihaozhe.db;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @create 2023-11-7
 */
public class Write {
    public static class WordMapper extends Mapper<LongWritable, Text, Word, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Word, NullWritable>.Context context) throws IOException, InterruptedException {
            String[] split = value.toString().split("\t");
            Word word = new Word();
            word.setName(split[0]);
            word.setCount(Integer.parseInt(split[1]));
            context.write(word, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // set the HADOOP_USER_NAME property to root
        System.setProperty("HADOOP_USER_NAME", "root");
        // configuration object
        Configuration conf = new Configuration();
        // JDBC connection parameters
        DBConfiguration.configureDB(conf,
                "com.mysql.cj.jdbc.Driver",
                "jdbc:mysql://spark03:3306/htu?useUnicode=true&createDatabaseIfNotExist=true&characterEncoding=UTF8&useSSL=false&serverTimeZone=Asia/Shanghai",
                "root", "Lihaozhe!!@@1122"
        );
        // cross-platform submission
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // run locally
        // conf.set("mapreduce.framework.name", "local");
        // use the local file system as the default file system
        // conf.set("fs.defaultFS", "file:///");
        // declare the Job object, i.e. the application
        Job job = Job.getInstance(conf, "write db");
        // set the driver class of this Job
        // comment this line out when submitting locally
        job.setJarByClass(Write.class);
        // enable this line when submitting locally
        // job.setJar("D:\\work\\河南师范大学\\2023\\bigdata2023\\Hadoop\\code\\hadoop\\target\\hadoop.jar");
        // set the Mapper of this Job
        job.setMapperClass(WordMapper.class);
        // set the Combiner of this Job; note: it must not change the final result, otherwise do not use one
        // job.setCombinerClass(WordCountReduce.class);
        // set the Reducer of this Job
        // job.setReducerClass(WordCountReduce.class);
        // set the number of reduce tasks to zero
        job.setNumReduceTasks(0);
        // set the data type of the map output key
        job.setMapOutputKeyClass(Word.class);
        // set the data type of the map output value
        job.setMapOutputValueClass(NullWritable.class);
        // set the data type of the final output key
        // job.setOutputKeyClass(Text.class);
        // set the data type of the final output value
        // job.setOutputValueClass(NullWritable.class);
        // map input path; note: this path is on HDFS by default
        FileInputFormat.addInputPath(job, new Path("/wordcount/result/part-r-00000"));
        // reduce output path; note: this path is on HDFS by default
        // Path dst = new Path("/video/ods");
        // // defensive code: if the reduce output directory already exists, delete it
        // DistributedFileSystem dfs = new DistributedFileSystem();
        // String nameService = conf.get("dfs.nameservices");
        // String hdfsRPCUrl = "hdfs://" + nameService + ":" + 8020;
        // dfs.initialize(URI.create(hdfsRPCUrl), conf);
        // if (dfs.exists(dst)) {
        //     dfs.delete(dst, true);
        // }
        // FileSystem fs = FileSystem.get(conf);
        // if (fs.exists(dst)) {
        //     fs.delete(dst, true);
        // }
        // FileOutputFormat.setOutputPath(job, dst);
        // set the output format class
        job.setOutputFormatClass(DBOutputFormat.class);
        // configure the table the data is written to
        DBOutputFormat.setOutput(job, "word", "name", "count");
        // submit the job
        // job.submit();
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
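As far as I can tell, DBOutputFormat.setOutput(job, "word", "name", "count") makes the output format prepare a statement of the form INSERT INTO word (name, count) VALUES (?, ?), and Word.write(PreparedStatement) fills in the two placeholders for every record the mapper emits. Note that word has no primary key, so re-running the job simply appends the same rows again. A plain JDBC row count is a quick way to confirm the export; the class below is not part of the original article and just mirrors the connection settings used above:

package com.lihaozhe.db;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Hypothetical standalone check, not part of the original article:
 * counts the rows that the "write db" job inserted into the word table.
 */
public class VerifyWrite {
    public static void main(String[] args) throws SQLException {
        String url = "jdbc:mysql://spark03:3306/htu?useUnicode=true&createDatabaseIfNotExist=true&characterEncoding=UTF8&useSSL=false&serverTimeZone=Asia/Shanghai";
        try (Connection conn = DriverManager.getConnection(url, "root", "Lihaozhe!!@@1122");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select count(*) from word")) {
            if (rs.next()) {
                System.out.println("rows in word: " + rs.getLong(1));
            }
        }
    }
}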
Note:
In a cluster environment the number of MapTasks is not fully under your control, so the final output may consist of more than one file.
You can set the number of MapTasks in code with conf.set("mapreduce.job.maps", "1").
package com.lihaozhe.db;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

/**
 * @author 李昊哲
 * @version 1.0
 * @create 2023-11-7
 */
public class Read {
    public static class WordMapper extends Mapper<LongWritable, Word, Word, NullWritable> {
        @Override
        protected void map(LongWritable key, Word value, Mapper<LongWritable, Word, Word, NullWritable>.Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // set the HADOOP_USER_NAME property to root
        System.setProperty("HADOOP_USER_NAME", "root");
        // configuration object
        Configuration conf = new Configuration();
        // JDBC connection parameters
        DBConfiguration.configureDB(conf,
                "com.mysql.cj.jdbc.Driver",
                "jdbc:mysql://spark03:3306/htu?useUnicode=true&createDatabaseIfNotExist=true&characterEncoding=UTF8&useSSL=false&serverTimeZone=Asia/Shanghai",
                "root", "Lihaozhe!!@@1122"
        );
        // cross-platform submission
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // set the number of MapTasks
        conf.set("mapreduce.job.maps", "1");
        // run locally
        // conf.set("mapreduce.framework.name", "local");
        // use the local file system as the default file system
        // conf.set("fs.defaultFS", "file:///");
        // declare the Job object, i.e. the application
        Job job = Job.getInstance(conf, "read db");
        // set the driver class of this Job
        // comment this line out when submitting locally
        job.setJarByClass(Read.class);
        // enable this line when submitting locally
        // job.setJar("D:\\work\\河南师范大学\\2023\\bigdata2023\\Hadoop\\code\\hadoop\\target\\hadoop.jar");
        // set the Mapper of this Job
        job.setMapperClass(WordMapper.class);
        // set the Combiner of this Job; note: it must not change the final result, otherwise do not use one
        // job.setCombinerClass(WordCountReduce.class);
        // set the Reducer of this Job
        // job.setReducerClass(WordCountReduce.class);
        // set the number of reduce tasks to zero
        job.setNumReduceTasks(0);
        // set the data type of the map output key
        job.setMapOutputKeyClass(Word.class);
        // set the data type of the map output value
        job.setMapOutputValueClass(NullWritable.class);
        // set the data type of the final output key
        // job.setOutputKeyClass(Text.class);
        // set the data type of the final output value
        // job.setOutputValueClass(NullWritable.class);
        // set the input format class
        job.setInputFormatClass(DBInputFormat.class);
        // configure the query the data is read from
        DBInputFormat.setInput(job, Word.class,
                "select name,count from word",
                "select count(*) from word");
        // map input path; note: this path is on HDFS by default
        // FileInputFormat.addInputPath(job, new Path("/wordcount/result/part-r-00000"));
        // reduce output path; note: this path is on HDFS by default
        Path dst = new Path("/wordcount/db");
        // defensive code: if the output directory already exists, delete it
        DistributedFileSystem dfs = new DistributedFileSystem();
        String nameService = conf.get("dfs.nameservices");
        String hdfsRPCUrl = "hdfs://" + nameService + ":" + 8020;
        dfs.initialize(URI.create(hdfsRPCUrl), conf);
        if (dfs.exists(dst)) {
            dfs.delete(dst, true);
        }
        // FileSystem fs = FileSystem.get(conf);
        // if (fs.exists(dst)) {
        //     fs.delete(dst, true);
        // }
        FileOutputFormat.setOutputPath(job, dst);
        // submit the job
        // job.submit();
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
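With this query-based overload, the second statement (select count(*) from word) tells DBInputFormat how many rows exist; the framework then divides that range across the map tasks and, as far as I know, each task runs the first query with an added LIMIT/OFFSET for its chunk. That is why mapreduce.job.maps effectively controls how many part-m files appear under /wordcount/db. DBInputFormat also has a table-based overload that builds both statements itself; the fragment below is a hypothetical alternative to the two explicit queries, using the same job object as above:

// hypothetical alternative: let DBInputFormat derive both queries from the table;
// the ORDER BY column keeps splits deterministic when several map tasks read in parallel
DBInputFormat.setInput(job, Word.class,
        "word",            // table name
        null,              // optional WHERE conditions
        "name",            // ORDER BY column
        "name", "count");  // columns to read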
Copying the data of one database table into another table with MapReduce
package com.lihaozhe.db;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;

import java.io.IOException;

/**
 * @author 李昊哲
 * @version 1.0
 * @create 2023-11-7
 */
public class Copy {
    public static class RWMapper extends Mapper<LongWritable, Word, Word, NullWritable> {
        @Override
        protected void map(LongWritable key, Word value, Mapper<LongWritable, Word, Word, NullWritable>.Context context) throws IOException, InterruptedException {
            context.write(value, NullWritable.get());
        }
    }

    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        System.setProperty("HADOOP_USER_NAME", "root");
        // configuration object
        Configuration conf = new Configuration();
        // JDBC connection parameters
        DBConfiguration.configureDB(conf,
                "com.mysql.cj.jdbc.Driver",
                "jdbc:mysql://spark03:3306/htu?useUnicode=true&createDatabaseIfNotExist=true&characterEncoding=UTF8&useSSL=false&serverTimeZone=Asia/Shanghai",
                "root", "Lihaozhe!!@@1122"
        );
        // cross-platform submission
        conf.set("mapreduce.app-submission.cross-platform", "true");
        // set the number of MapTasks
        // conf.set("mapreduce.job.maps", "1");
        // declare the Job object, i.e. the application
        Job job = Job.getInstance(conf, "copy db");
        job.setJarByClass(Copy.class);
        job.setMapperClass(RWMapper.class);
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(Word.class);
        job.setMapOutputValueClass(NullWritable.class);
        // set the input format class
        job.setInputFormatClass(DBInputFormat.class);
        // configure the query the data is read from
        DBInputFormat.setInput(job, Word.class,
                "select name,count from word",
                "select count(*) from word");
        // set the output format class
        job.setOutputFormatClass(DBOutputFormat.class);
        // configure the table the data is written to
        DBOutputFormat.setOutput(job, "new_word", "name", "count");
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
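The same caveat applies to the copy job: DBOutputFormat only inserts and never clears the target, so every re-run appends another full copy of word into new_word (which has no primary key). If a clean target table is wanted, it can be truncated before the job is submitted; the helper below is hypothetical, not part of the original code, and reuses the connection settings from above:

package com.lihaozhe.db;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Hypothetical helper, not part of the original article: empties new_word so that
 * a re-run of the copy job starts from a clean target table.
 */
public class TruncateNewWord {
    public static void main(String[] args) throws SQLException {
        String url = "jdbc:mysql://spark03:3306/htu?useUnicode=true&createDatabaseIfNotExist=true&characterEncoding=UTF8&useSSL=false&serverTimeZone=Asia/Shanghai";
        try (Connection conn = DriverManager.getConnection(url, "root", "Lihaozhe!!@@1122");
             Statement stmt = conn.createStatement()) {
            stmt.executeUpdate("truncate table new_word");
            System.out.println("new_word emptied");
        }
    }
}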