Importing from MySQL to Hadoop with the Sqoop Java API


Goal: import existing MySQL data into HDFS, driving Sqoop programmatically from Java to avoid the hassle of installing a local Sqoop environment.

Data preparation

CREATE DATABASE test;
USE test;

CREATE TABLE `vote_record` (
  `id`          INT(11)     NOT NULL AUTO_INCREMENT,
  `user_id`     VARCHAR(20) NOT NULL,
  `vote_id`     INT(11)     NOT NULL,
  `group_id`    INT(11)     NOT NULL,
  `create_time` DATETIME    NOT NULL,
  PRIMARY KEY (`id`),
  KEY `index_user_id` (`user_id`) USING HASH
)
  ENGINE = INNODB
  AUTO_INCREMENT = 1
  DEFAULT CHARSET = utf8;

-- Switch the statement delimiter so the semicolons inside the
-- function/procedure bodies don't terminate the CREATE statements
-- when run in the mysql command-line client.
DELIMITER $$

CREATE FUNCTION `rand_string`(n INT)
  RETURNS VARCHAR(255) CHARSET latin1
  BEGIN
    DECLARE chars_str VARCHAR(100) DEFAULT 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789';
    DECLARE return_str VARCHAR(255) DEFAULT '';
    DECLARE i INT DEFAULT 0;
    WHILE i < n DO
      -- append one of the 62 characters chosen at random
      SET return_str = concat(return_str, substring(chars_str, FLOOR(1 + RAND() * 62), 1));
      SET i = i + 1;
    END WHILE;
    RETURN return_str;
  END$$

CREATE PROCEDURE `add_vote_record`(IN n INT)
  BEGIN
    DECLARE i INT DEFAULT 1;
    WHILE (i <= n) DO
      INSERT INTO vote_record (user_id, vote_id, group_id, create_time)
      VALUES (rand_string(20), FLOOR(RAND() * 1000), FLOOR(RAND() * 100), now());
      SET i = i + 1;
    END WHILE;
  END$$

DELIMITER ;

CALL add_vote_record(1000000);
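
Before running the import, it is worth confirming that the stored procedure actually produced the rows. A minimal JDBC sketch, assuming the same local MySQL instance and the root/system credentials used in the import code below:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;

public class CountRows {
    public static void main(String[] args) throws Exception {
        // URL and credentials match the ones passed to Sqoop later in this article
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://127.0.0.1:3306/test", "root", "system");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("SELECT COUNT(*) FROM vote_record")) {
            rs.next();
            System.out.println("vote_record rows: " + rs.getLong(1)); // expect 1000000
        }
    }
}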

Maven dependencies

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.46</version>
        </dependency>
        <dependency>
            <groupId>org.apache.sqoop</groupId>
            <artifactId>sqoop</artifactId>
            <version>1.4.7</version>
            <classifier>hadoop260</classifier>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>2.9.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.8.2</version>
        </dependency>
    </dependencies>

Execution code

package cn.hackcoder;

import org.apache.hadoop.conf.Configuration;
import org.apache.sqoop.Sqoop;
import org.apache.sqoop.tool.SqoopTool;
import org.apache.sqoop.util.OptionsFileUtil;

/**
 * Created by linzhichao on 2018/5/15.
 */
public class SqoopUtils {
    private static int importDataFromMysql() throws Exception {
        // Same arguments you would pass to the "sqoop import" CLI command
        String[] args = new String[]{
                "--connect", "jdbc:mysql://127.0.0.1:3306/test",
                "--driver", "com.mysql.jdbc.Driver",
                "--username", "root",
                "--password", "system",
                "--table", "vote_record",
                "-m", "1",                 // single mapper, so no --split-by is needed
                "--target-dir", "/user/root/import_vote_record"
        };

        String[] expandArguments = OptionsFileUtil.expandArguments(args);

        SqoopTool tool = SqoopTool.getTool("import");

        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://127.0.0.1:9000");//设置HDFS服务地址
        conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
        Configuration loadPlugins = SqoopTool.loadPlugins(conf);

        // Sqoop 1.4.x's constructor still takes the legacy com.cloudera.sqoop.tool.SqoopTool type
        Sqoop sqoop = new Sqoop((com.cloudera.sqoop.tool.SqoopTool) tool, loadPlugins);
        return Sqoop.runSqoop(sqoop, expandArguments); // returns 0 on success
    }

    private static int listTablesFromMysql() throws Exception {
        String[] args = new String[]{
                "--connect", "jdbc:mysql://127.0.0.1:3306/test",
                "--username", "root",
                "--password", "system",
        };

        String[] expandArguments = OptionsFileUtil.expandArguments(args);

        Configuration pluginConf = SqoopTool.loadPlugins(new Configuration());
        com.cloudera.sqoop.tool.SqoopTool tool = (com.cloudera.sqoop.tool.SqoopTool) SqoopTool
                .getTool("list-tables");
        if (null == tool) {
            System.err.println("No such sqoop tool: list-tables. See 'sqoop help'.");
            return 1; // bail out instead of dereferencing the null tool below
        }
        Sqoop sqoop = new Sqoop(tool, pluginConf);
        return Sqoop.runSqoop(sqoop, expandArguments);
    }

    public static void main(String[] args) throws Exception {
        System.out.println(listTablesFromMysql());
        importDataFromMysql();
    }
}
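
After the import finishes, Sqoop writes one text part file per mapper under the target directory (with -m 1, typically a single part-m-00000). A minimal sketch to read the result back through the HDFS FileSystem API, reusing the NameNode address from the import code:

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class VerifyImport {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.default.name", "hdfs://127.0.0.1:9000"); // same NameNode as the import

        FileSystem fs = FileSystem.get(conf);
        Path dir = new Path("/user/root/import_vote_record");

        // List the part files Sqoop produced
        for (FileStatus status : fs.listStatus(dir)) {
            System.out.println(status.getPath() + "  " + status.getLen() + " bytes");
        }

        // Print the first few imported records (fields are comma-separated by default)
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(new Path(dir, "part-m-00000"))))) {
            for (int i = 0; i < 5; i++) {
                String line = reader.readLine();
                if (line == null) break;
                System.out.println(line);
            }
        }
    }
}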

  • Note that the Hadoop version running on the cluster must be compatible with the Hadoop version declared in the Maven dependencies above.
  • When MySQL runs with binlog enabled, creating the rand_string function requires first executing
    set global log_bin_trust_function_creators=TRUE;
