Storing MapReduce Analysis Results in MySQL

1. Introduction

We have a file in which each line is one employee record containing the employee number, department, base salary, bonus, and so on. The task is to analyze this file with MapReduce, compute the average salary (base salary + bonus) for each department, and store the result in MySQL.

Environment: Eclipse + JDK 1.8 + Hadoop 3.2.1 + MySQL 8

2. Data Preparation

2.1 Prepare emp.csv and upload it to HDFS
7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30
7369,SMITH,CLERK,7902,1980-12-17,800,,20
7521,WARD,SALESMAN,7698,1981-02-22,1250,500,30
7566,JONES,MANAGER,7839,1981-04-02,3375,,20
7654,MARTIN,SALESMAN,7698,1981-09-28,1250,1400,30
7698,BLAKE,MANAGER,7839,1981-05-01,2850,,30
7782,CLARK,MANAGER,7839,1981-06-09,2450,,10
7788,SCOTT,ANALYST,7566,1987-04-19,3000,,20
7839,KING,PRESIDENT,,1981-11-17,5000,,10
7844,TURNER,SALESMAN,7698,1981-09-08,1500,0,30
7876,ADAMS,CLERK,7788,1987-05-23,1100,,20
7900,JAMES,CLERK,7698,1981-12-03,950,,30
7902,FORD,ANALYST,7566,1981-12-03,3000,,20
7934,MILLER,CLERK,7782,1982-01-23,1300,,10
2.2 Create the result table deptsalary in MySQL

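A minimal table definition consistent with the insert(?,?) call in EmpSalaryDao below would be something like the following; the column names here are only placeholders, any two columns of type int and float will do:

create table deptsalary (
    deptno  int,
    avg_sal float
);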

3. Project Directory Structure

(Screenshot of the Eclipse project structure. The source is laid out as follows:)

src/main/java
  com.ex.demo.util     ConnectionFactory.java
  com.ex.demo.dao      EmpSalaryDao.java
  com.ex.demo.mapper   EmpSalMapper.java
  com.ex.demo.reducer  EmpSalReducer.java
  com.ex.demo.job      EmpSalJob.java
pom.xml

4. Code

4.1 pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.ex.demo</groupId>
  <artifactId>hadoopExProject</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  
  <dependencies>
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <version>3.2.1</version>
      </dependency>
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-common</artifactId>
          <version>3.2.1</version>
      </dependency>
      <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-mapreduce-client-core</artifactId>
          <version>3.2.1</version>
      </dependency>
      <dependency>
          <groupId>mysql</groupId>
          <artifactId>mysql-connector-java</artifactId>
          <version>8.0.15</version>
      </dependency>
  </dependencies>
  
</project>
4.2 ConnectionFactory.java
package com.ex.demo.util;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class ConnectionFactory {

    public static Connection getConnection() {
        Connection connection = null;
        try {
            // MySQL 8 driver class (com.mysql.jdbc.Driver is deprecated)
            Class.forName("com.mysql.cj.jdbc.Driver");
            // MySQL 8 requires serverTimezone; characterEncoding and useSSL are set explicitly
            String url = "jdbc:mysql://localhost:3306/mrout?characterEncoding=utf-8&useSSL=false&serverTimezone=UTC&rewriteBatchedStatements=true";
            String username = "root";
            String password = "root";
            connection = DriverManager.getConnection(url, username, password);
        } catch (ClassNotFoundException | SQLException e) {
            System.err.println("Failed to connect to the database!");
            e.printStackTrace();
        }
        return connection;
    }
}

Note the MySQL 8 driver class (com.mysql.cj.jdbc.Driver) and connection URL: serverTimezone must be specified, and the characterEncoding and useSSL parameters are set explicitly.
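Before running the whole job, it can help to verify the JDBC settings with a small standalone check. This is only a sketch (the ConnectionCheck class is not part of the original project) and assumes the mrout database and the credentials above exist:

package com.ex.demo.util;

import java.sql.Connection;

// Hypothetical one-off check (not part of the original project):
// verifies the JDBC URL and credentials before running the MapReduce job.
public class ConnectionCheck {
    public static void main(String[] args) throws Exception {
        try (Connection c = ConnectionFactory.getConnection()) {
            // getConnection() returns null (and logs an error) if the connection failed
            System.out.println("MySQL connection OK: " + (c != null && !c.isClosed()));
        }
    }
}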

4.3 EmpSalaryDao.java
package com.ex.demo.dao;

import com.ex.demo.util.ConnectionFactory;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class EmpSalaryDao {

    // Insert one department's average salary
    public void insert(int dept, float salary) {
        String sql = "insert into deptsalary values(?,?)";
        try (Connection connection = ConnectionFactory.getConnection();
             PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            preparedStatement.setInt(1, dept);
            preparedStatement.setFloat(2, salary);
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Insert failed!");
            e.printStackTrace();
        }
    }

    // Truncate the result table before a new run
    public void clear() {
        String sql = "truncate table deptsalary";
        try (Connection connection = ConnectionFactory.getConnection();
             PreparedStatement preparedStatement = connection.prepareStatement(sql)) {
            preparedStatement.executeUpdate();
        } catch (SQLException e) {
            System.err.println("Failed to clear the table!");
            e.printStackTrace();
        }
    }
}
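Note that insert() opens a new connection for each call. With only a handful of departments this is fine; for larger result sets it would be better to reuse a single connection or batch the writes.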
4.4 EmpSalMapper.java
package com.ex.demo.mapper;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class EmpSalMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // fields: empno, ename, job, mgr, hiredate, sal, comm, deptno
        String[] strings = value.toString().split(",");
        int salary = Integer.parseInt(strings[5]);
        // an empty bonus field (comm) is treated as 0
        if ("".equals(strings[6]))
            strings[6] = "0";
        int bonus = Integer.parseInt(strings[6]);
        // key = department number, value = salary + bonus
        context.write(new IntWritable(Integer.parseInt(strings[7])), new IntWritable(salary + bonus));
    }
}
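For example, the input line 7499,ALLEN,SALESMAN,7698,1981-02-20,1600,300,30 splits into eight fields; the mapper emits the department number 30 (field index 7) as the key and 1600 + 300 = 1900 as the value. An empty bonus field, as in SMITH's record, is treated as 0.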
4.5 EmpSalReducer.java
package com.ex.demo.reducer;

import com.ex.demo.dao.EmpSalaryDao;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class EmpSalReducer extends Reducer<IntWritable, IntWritable, IntWritable, FloatWritable> {

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // compute the average salary for this department
        int sum = 0;
        int count = 0;
        for (IntWritable val : values) {
            sum += val.get();
            count++;
        }
        float avgSal = sum * 1.0f / count;
        // write the result to HDFS
        context.write(key, new FloatWritable(avgSal));
        // save the result to MySQL
        new EmpSalaryDao().insert(key.get(), avgSal);
    }
}
4.6 EmpSalJob.java
package com.ex.demo.job;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.ex.demo.dao.EmpSalaryDao;
import com.ex.demo.mapper.EmpSalMapper;
import com.ex.demo.reducer.EmpSalReducer;

public class EmpSalJob {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // clear the MySQL result table
        new EmpSalaryDao().clear();
        // remove the previous result directory from HDFS
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.43.50:9000");
        conf.set("dfs.replication", "1");
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path("/mrout/empsalout");
        if (fs.exists(path)) {
            boolean res = fs.delete(path, true);
            System.out.println("Deleted previous HDFS output: " + res);
        }
        fs.close();
        // configure and run the job
        Job job = Job.getInstance(conf);
        job.setJarByClass(EmpSalJob.class);
        job.setMapperClass(EmpSalMapper.class);
        job.setReducerClass(EmpSalReducer.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class); // originally job.setMapOutputValueClass(FloatWritable.class); changed after fixing the type mismatch in section 5.1
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.43.50:9000/input/emp.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.43.50:9000/mrout/empsalout"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
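With the sample data above, the expected department averages (written both to /mrout/empsalout and to the deptsalary table) would be roughly: department 10 → 2916.67, department 20 → 2255.0, department 30 → 1933.33.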

5. Troubleshooting

The code above is the corrected version; the original lines are preserved in comments. The errors below were encountered while writing the code and are recorded here for reference.

5.1 Type mismatch
java.lang.Exception: java.io.IOException: Type mismatch in value from map: expected org.apache.hadoop.io.FloatWritable, received org.apache.hadoop.io.IntWritable
 at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:492)
 at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:552)
Caused by: java.io.IOException: Type mismatch in value from map: expected org.apache.hadoop.io.FloatWritable, received org.apache.hadoop.io.IntWritable
 at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1093)
 at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:727)
 at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
 at org.apache.hadoop.mapreduce.lib.map.WrappedMapper$Context.write(WrappedMapper.java:112)
 at com.ex.demo.mapper.EmpSalMapper.map(EmpSalMapper.java:20)
 at com.ex.demo.mapper.EmpSalMapper.map(EmpSalMapper.java:1)
 at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:146)
 at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:799)
 at org.apache.hadoop.mapred.MapTask.run(MapTask.java:347)
 at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:271)
 at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
 at java.util.concurrent.FutureTask.run(FutureTask.java:266)
 at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
 at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
 at java.lang.Thread.run(Thread.java:748)

Solution:
The mapper output value type was configured incorrectly; change FloatWritable in job.setMapOutputValueClass(FloatWritable.class); to IntWritable so that it matches the value type the mapper actually emits.

Further notes:

  • A Mapper is declared as Mapper<K1, V1, K2, V2> and a Reducer as Reducer<K2, V2, K3, V3>;
    the Mapper's output types (K2, V2) must match the Reducer's input types.

  • The types of K1 and V1 are generally determined by job.setInputFormatClass(), e.g. job.setInputFormatClass(TextInputFormat.class).

    job.setOutputKeyClass(Text.class); and job.setOutputValueClass(Text.class); set the output key/value types for both the Mapper and the Reducer at the same time.

    If the Mapper's output key or value type should differ from the Reducer's, set the Mapper's output types separately with setMapOutputKeyClass and setMapOutputValueClass, as in the sketch below.
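As a concrete sketch for this job: the two setMapOutput* calls are the ones used in EmpSalJob above, while the setOutputKeyClass/setOutputValueClass calls are not in the original job but would make the reducer's output types explicit:

// Map output (K2, V2): department number -> salary + bonus
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(IntWritable.class);
// Reduce output (K3, V3): department number -> average salary (not set in EmpSalJob above)
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(FloatWritable.class);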
