We have a file in which each line holds one employee record: employee number, department, base salary, bonus, and so on. The task is to analyze this file with MapReduce, compute the average salary (base salary + bonus) for each department, and store the result in MySQL.
Environment: Eclipse + JDK 1.8 + Hadoop 3.2.1 + MySQL 8
Sample input data (emp.csv); the fields are empno, ename, job, mgr, hiredate, sal, comm, deptno:
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7369,SMITH,CLERK,7902,1980-12-17,800,,20
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02,3375,,20
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7782,CLARK,MANAGER,7839,1981-06-09,2450,,10
7788,SCOTT,ANALYST,7566,1987-04-19,3000,,20
7839,KING,PRESIDENT,,1981-11-17,5000,,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23,1100,,20
7900,JAMES,CLERK,7698,1981-12-03,950,,30
7902,FORD,ANALYST,7566,1981-12-03,3000,,20
7934,MILLER,CLERK,7782,1982-01-23,1300,,10
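As a sanity check against the sample data (missing bonuses count as 0): department 30 has six employees, so its expected average is (1600+300) + (1250+500) + (1250+1400) + 2850 + 1500 + 950 = 11600, divided by 6, which is roughly 1933.33; departments 10 and 20 are computed the same way.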
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.ex.demo</groupId>
    <artifactId>hadoopExProject</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>3.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>3.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>3.2.1</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>8.0.15</version>
        </dependency>
    </dependencies>
</project>
package com.ex.demo.util;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
public class ConnectionFactory {
    public static Connection getConnection() {
        Connection connection = null;
        try {
            Class.forName("com.mysql.cj.jdbc.Driver");
            String url = "jdbc:mysql://localhost:3306/mrout?characterEncoding=utf-8&useSSL=false&serverTimezone=UTC&rewriteBatchedStatements=true";
            String username = "root";
            String password = "root";
            connection = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("Failed to connect to the database!");
            e.printStackTrace();
        }
        return connection;
    }
}
Note the driver class and URL parameters required by MySQL 8 (com.mysql.cj.jdbc.Driver, plus serverTimezone and useSSL).
Reference: link
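Before the first run, the mrout database referenced in the JDBC URL and the deptsalary table must already exist. A minimal one-off setup sketch, reusing ConnectionFactory; the table name and column types are taken from EmpSalaryDao below, while the class name CreateTable and the column names deptno/avgsal are only illustrative:
package com.ex.demo.util;
import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
public class CreateTable {
    public static void main(String[] args) {
        // Assumed schema: one row per department, with the average salary stored as FLOAT
        String ddl = "create table if not exists deptsalary (deptno int, avgsal float)";
        try (Connection connection = ConnectionFactory.getConnection();
             Statement statement = connection.createStatement()) {
            statement.executeUpdate(ddl);
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}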
package com.ex.demo.dao;
import com.ex.demo.util.ConnectionFactory;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
public class EmpSalaryDao {
    // Insert one row: department number and its average salary
    public void insert(int dept, float salary) {
        String sql = "insert into deptsalary values(?,?)";
        try (Connection connection = ConnectionFactory.getConnection();
             PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            preparedStatement.setInt(1, dept);
            preparedStatement.setFloat(2, salary);
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Insert failed!");
            e.printStackTrace();
        }
    }

    // Clear the table before a new run
    public void clear() {
        String sql = "truncate table deptsalary";
        try (Connection connection = ConnectionFactory.getConnection();
             PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Failed to clear the table!");
            e.printStackTrace();
        }
    }
}
package com.ex.demo.mapper;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class EmpSalMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Fields: empno, ename, job, mgr, hiredate, sal, comm, deptno
        String[] strings = value.toString().split(",");
        int salary = Integer.parseInt(strings[5]);
        // The bonus (comm) field may be empty; treat it as 0
        if ("".equals(strings[6]))
            strings[6] = "0";
        int bonus = Integer.parseInt(strings[6]);
        // Key: department number, value: base salary + bonus
        context.write(new IntWritable(Integer.parseInt(strings[7])), new IntWritable(salary + bonus));
    }
}
package com.ex.demo.reducer;
import com.ex.demo.dao.EmpSalaryDao;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class EmpSalReducer extends Reducer<IntWritable, IntWritable, IntWritable, FloatWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // Compute the average salary for this department
        int sum = 0;
        int count = 0;
        for (IntWritable val : values) {
            sum += val.get();
            count++;
        }
        float avgSal = sum * 1.0f / count;
        // Write the result to HDFS
        context.write(key, new FloatWritable(avgSal));
        // Save the result to the MySQL database
        new EmpSalaryDao().insert(key.get(), avgSal);
    }
}
package com.ex.demo.job;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.ex.demo.dao.EmpSalaryDao;
import com.ex.demo.mapper.EmpSalMapper;
import com.ex.demo.reducer.EmpSalReducer;
public class EmpSalJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Clear the result table in MySQL
        new EmpSalaryDao().clear();
        // Delete any previous result directory on HDFS
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.43.50:9000");
        conf.set("dfs.replication", "1");
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/mrout/empsalout");
        if (fs.exists(path)) {
            boolean res = fs.delete(path, true);
            System.out.println("Deleted previous HDFS output: " + res);
        }
        fs.close();
        // Configure and run the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(EmpSalJob.class);
        job.setMapperClass(EmpSalMapper.class);
        job.setReducerClass(EmpSalReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class); // originally job.setMapOutputValueClass(FloatWritable.class); changed to the current code after resolving the error in 4.1
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.43.50:9000/input/emp.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.43.50:9000/mrout/empsalout"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
The code above is the final working version; the original lines are preserved in the comments. The errors encountered while writing the code are recorded below for reference.
java.lang.Exception: java.io.IOException: Type mismatch in value from map: expected org.apache.hadoop.io.FloatWritable, received org.apache.hadoop.io.IntWritable
at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:492)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:552)
Caused by: java.io.IOException: Type mismatch in value from map: expected org.apache.hadoop.io.FloatWritable, received org.apache.hadoop.io.IntWritable
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1093)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:727)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
at com.ex.demo.mapper.EmpSalMapper.map(EmpSalMapper.java:20)
at com.ex.demo.mapper.EmpSalMapper.map(EmpSalMapper.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:799)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:347)
at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:271)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Solution:
The error was in the Mapper output type setting: the job originally called job.setMapOutputValueClass(FloatWritable.class) while the Mapper actually emits IntWritable values, hence the type mismatch; changing it to IntWritable.class (as in the code above) fixes it.
Additional notes (reference: link):
Mapper signature: Mapper<K1, V1, K2, V2>
Reducer signature: Reducer<K2, V2, K3, V3>
The Mapper's output types (K2, V2) and the Reducer's input types (K2, V2) must match.
The types of K1 and V1 are generally determined by the input format set with job.setInputFormatClass(), e.g. job.setInputFormatClass(TextInputFormat.class), which produces LongWritable keys and Text values.
job.setOutputKeyClass() and job.setOutputValueClass() set the output key/value types for the Mapper and the Reducer at the same time.
If you want the Mapper's output key or value types to differ from the Reducer's, use job.setMapOutputKeyClass() and job.setMapOutputValueClass() to declare the Mapper's output key/value types separately.
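To make the rule concrete, here is a minimal driver sketch (the class name TypeSettingsSketch is just a placeholder) showing how each setter corresponds to one of the generic parameters, using the same types as this job:
package com.ex.demo.job;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
public class TypeSettingsSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // K1/V1 are decided by the input format: TextInputFormat gives LongWritable keys (byte offsets) and Text values (lines)
        job.setInputFormatClass(TextInputFormat.class);
        // K2/V2: the Mapper's output types; only needed when they differ from the final output types
        job.setMapOutputKeyClass(IntWritable.class);   // K2
        job.setMapOutputValueClass(IntWritable.class); // V2
        // K3/V3: the Reducer's (final) output types
        job.setOutputKeyClass(IntWritable.class);      // K3
        job.setOutputValueClass(FloatWritable.class);  // V3
    }
}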