准备工作:
1、安装和配置好 MySQL
2、创建数据表,并写入测试数据;以及创建结果表,以便MR写入数据
注意:
1、与 mysql 交互时,Mapper的输入类型为 (LongWritable,DBWritable);Reducer的输出类型为 (DBWritable,NullWritable)
2、如果提交到集群上运行,需要注意 2 点:
a> mysql 的 url 地址要修改为 mysql所在主机的 IP
b> 因为打 jar 包时,不会将mysql的驱动一并打入jar包;
所以,要将 mysql 驱动 jar包 拷贝到各个节点的 $HADOOP_HOME/share/hadoop/common/lib 目录下
做好以上准备工作,就可以开始写代码:
WCApp.class
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBOutputFormat;
/**
 * Driver for a word-count job that reads its input rows from a MySQL table
 * ({@code words}) via DBInputFormat and writes aggregated results to another
 * MySQL table ({@code wc_result}) via DBOutputFormat.
 */
public class WCApp {
    public static void main(String[] args) throws Exception {
        // Windows-only workaround so Hadoop can find winutils.exe.
        // NOTE(review): hard-coded local path — harmless on a cluster, but confirm.
        System.setProperty("hadoop.home.dir", "H:\\hadoop-2.4.1");

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(WCApp.class);
        job.setJobName("WCDBApp");
        job.setNumReduceTasks(3);

        job.setMapperClass(WCDBMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(WCReducer.class);
        job.setOutputKeyClass(WordRSDBWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Read from / write to the database instead of HDFS.
        job.setInputFormatClass(DBInputFormat.class);
        job.setOutputFormatClass(DBOutputFormat.class);

        // JDBC connection settings.
        // NOTE: when submitting to a cluster, replace localhost with the MySQL host's IP.
        // NOTE(review): credentials are hard-coded — consider externalizing to configuration.
        String driverClass = "com.mysql.jdbc.Driver";
        String dbUrl = "jdbc:mysql://localhost:3306/bigdata";
        String userName = "root";
        String passwd = "root";

        // Signature: (Configuration conf, String driverClass, String dbUrl, String userName, String passwd)
        DBConfiguration.configureDB(job.getConfiguration(), driverClass, dbUrl, userName, passwd);

        // Input query plus the count query DBInputFormat uses to compute splits.
        DBInputFormat.setInput(job, WordsDBWritable.class,
                "select * from words", "select count(*) from words");

        // Output table and its column list (must match WordRSDBWritable.write(PreparedStatement)).
        DBOutputFormat.setOutput(job, "wc_result", "id", "word", "count");

        // FIX: propagate job success/failure as the process exit code instead of
        // silently discarding waitForCompletion's boolean result.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
WCDBMapper.class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Map phase: receives one row of the {@code words} table per call
 * (key = record offset starting at 0, value = the row), tokenizes the
 * {@code txt} column on single spaces, and emits (word, 1) pairs.
 */
public class WCDBMapper extends Mapper<LongWritable, WordsDBWritable, Text, IntWritable> {
    // FIX: the original declared `extends Mapper` (raw type), so the
    // @Override on map(LongWritable, WordsDBWritable, Context) did not
    // actually override raw map(Object, Object, Context) and failed to compile.
    // The generic parameters above match the job wiring in WCApp.

    /** Constant 1; reused across records to avoid per-token allocation. */
    private static final IntWritable ONE = new IntWritable(1);
    /** Reusable output key (standard MapReduce idiom). */
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, WordsDBWritable value, Context context)
            throws IOException, InterruptedException {
        String txt = value.getTxt();
        if (txt == null) {
            return; // defensive: the txt column may be NULL in the source table
        }
        for (String token : txt.split(" ")) {
            // FIX: split(" ") yields empty strings for consecutive spaces;
            // skip them instead of counting "" as a word.
            if (!token.isEmpty()) {
                word.set(token);
                context.write(word, ONE);
            }
        }
    }
}
WCReducer.class
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reduce phase: sums the per-word counts and writes one row per word to MySQL.
 *
 * NOTE: when writing to SQL, the Reducer's output key/value types must be
 * (DBWritable, NullWritable) — the DBWritable key becomes the inserted row.
 */
public class WCReducer extends Reducer<Text, IntWritable, WordRSDBWritable, NullWritable> {
    // FIX: the original declared `extends Reducer` (raw type) with a raw
    // `Iterable values` parameter, so the @Override did not match the raw
    // reduce(Object, Iterable, Context) and the enhanced-for over IntWritable
    // could not compile. The generic parameters above match the job wiring.

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable value : values) {
            total += value.get();
        }
        WordRSDBWritable row = new WordRSDBWritable();
        row.setName(key.toString());
        row.setCount(total);
        // NOTE(review): row.id is never set here (stays 0), yet WCApp lists "id"
        // as an output column — confirm wc_result.id tolerates explicit 0s
        // (e.g. auto-increment with NO_AUTO_VALUE_ON_ZERO) or drop the column.
        context.write(row, NullWritable.get());
    }
}
WordsDBWritable.class
/**
* 自定义的类,用于与数据库交互
* 对应的sql表名:words
*/
public class WordsDBWritable implements Writable,DBWritable {
private int id;
private String name;
private String txt;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return name;
}
public void setName(String name) {
this.name = name;
}
public String getTxt() {
return txt;
}
public void setTxt(String txt) {
this.txt = txt;
}
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeUTF(name);
out.writeUTF(txt);
}
public void readFields(DataInput in) throws IOException {
id = in.readInt();
name = in.readUTF();
txt = in.readUTF();
}
/*
向 db 写
*/
public void write(PreparedStatement statement) throws SQLException {
statement.setInt(1,id);
statement.setString(2,name);
statement.setString(3,txt);
}
/*
从 db 读
*/
public void readFields(ResultSet resultSet) throws SQLException {
id = resultSet.getInt(1);
name = resultSet.getString(2);
txt = resultSet.getString(3);
}
}
WordRSDBWritable.class
/**
* 自定义的类,用于与数据库交互
* 对应的sql表名:wc_result
*/
public class WordRSDBWritable implements Writable,DBWritable {
private int id;
private String word;
private int count;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getName() {
return word;
}
public void setName(String name) {
this.word = name;
}
public int getCount() {
return count;
}
public void setCount(int count) {
this.count = count;
}
public void write(DataOutput out) throws IOException {
out.writeInt(id);
out.writeUTF(word);
out.writeInt(count);
}
public void readFields(DataInput in) throws IOException {
id = in.readInt();
word = in.readUTF();
count = in.readInt();
}
/*
向 db 写
*/
public void write(PreparedStatement statement) throws SQLException {
statement.setInt(1,id);
statement.setString(2, word);
statement.setInt(3, count);
}
/*
从 db 读
*/
public void readFields(ResultSet resultSet) throws SQLException {
id = resultSet.getInt(1);
word = resultSet.getString(2);
count = resultSet.getInt(3);
}
}
注意:因为要用mysql的驱动,所以要在pom.xml文件中加入 mysql的引用。
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.17</version>
</dependency>