This section applies to Sqoop 1, version 1.4.6.
Sqoop can extract data directly into Hive on its own, and it supports importing into Hive tables that have a single partition column. The command looks like this:
sqoop import \
--connect jdbc:oracle:thin:@{IP}:@{PORT}:hive \
--username test --password test --table test --columns col1,col2 \
--fields-terminated-by '\001' [--null-string '\\N'] \
-m 1 --create-hive-table \
--hive-drop-import-delims \
--hive-overwrite \
--hive-import --hive-partition-key KEY --hive-partition-value VALUE \
--target-dir /test/test_partition \
--delete-target-dir
This method, however, cannot be used for Hive tables with multiple partition columns. There are two broad workarounds: load the data directly into the partition's HDFS directory and then register it with ALTER TABLE, or modify the part of the Sqoop source code that builds the LOAD DATA statement (loadstmt).
The first (direct-load) approach works as follows:
1. Create the multi-partition table
create table dbtest.testTable(ID int, NAME int, AGE int, Instruction string, CreateDate string) partitioned by (ORG_NO string, LOAD_DATA string) row format delimited fields terminated by '\001'
2. Create the partition
alter table dbtest.testTable add partition (ORG_NO='001',LOAD_DATA='2018-01-01')
3. Import the data
sqoop import --append --connect jdbc:oracle:thin:@{IP}:@{PORT}:hive \
--username test --password test --query @{sql} \
--target-dir "/hive/warehouse/dbtest.db/testTable/ORG_NO=001/LOAD_DATA=2018-01-01" \
--fields-terminated-by '\001' --null-string '\\N' \
--hive-database "dbtest" --hive-table "testTable" \
--split-by ID -m 1
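Once the files land in the partition directory and the partition from step 2 exists, the data should be queryable. As an optional, illustrative sanity check (not part of the original procedure), you might run the following in the Hive CLI:
show partitions dbtest.testTable;
select count(*) from dbtest.testTable where ORG_NO='001' and LOAD_DATA='2018-01-01';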
The only reason Sqoop itself does not support multi-partition tables is the code it uses to submit the data to Hive once the load completes (org.apache.sqoop.hive.TableDefWriter):
/**
 * @return the LOAD DATA statement to import the data in HDFS into hive.
 */
public String getLoadDataStmt() throws IOException {
  Path finalPath = getFinalPath();

  StringBuilder sb = new StringBuilder();
  sb.append("LOAD DATA INPATH '");
  sb.append(finalPath.toString() + "'");
  if (options.doOverwriteHiveTable()) {
    sb.append(" OVERWRITE");
  }
  sb.append(" INTO TABLE `");
  if (options.getHiveDatabaseName() != null) {
    sb.append(options.getHiveDatabaseName()).append("`.`");
  }
  sb.append(outputTableName);
  sb.append('`');

  /** This is the key part. **/
  if (options.getHivePartitionKey() != null) {
    sb.append(" PARTITION (")
      .append(options.getHivePartitionKey())
      .append("='").append(options.getHivePartitionValue())
      .append("')");
  }

  LOG.debug("Load statement: " + sb.toString());
  return sb.toString();
}
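To see why this only works for a single partition column: if comma-separated values were passed to this unmodified code, say --hive-partition-key ORG_NO,LOAD_DATA and --hive-partition-value 001,2018-01-01, the straight concatenation would produce a clause like the one below (the INPATH here is a placeholder for whatever getFinalPath() returns), which Hive does not accept as a valid static partition specification:
LOAD DATA INPATH '/test/test_partition' INTO TABLE `dbtest`.`testTable` PARTITION (ORG_NO,LOAD_DATA='001,2018-01-01')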
Because the partition key and value are concatenated as-is, only a single partition column is supported. Correcting just this piece of logic is enough to enable multi-partition imports; the modified code is shown below:
/**
 * @return the LOAD DATA statement to import the data in HDFS into hive.
 */
public String getLoadDataStmt() throws IOException {
  Path finalPath = getFinalPath();

  StringBuilder sb = new StringBuilder();
  sb.append("LOAD DATA INPATH '");
  sb.append(finalPath.toString() + "'");
  if (options.doOverwriteHiveTable()) {
    sb.append(" OVERWRITE");
  }
  sb.append(" INTO TABLE `");
  if (options.getHiveDatabaseName() != null) {
    sb.append(options.getHiveDatabaseName()).append("`.`");
  }
  sb.append(outputTableName);
  sb.append('`');

  /** Changed: accept multiple partition keys/values as comma-separated lists. **/
  if (options.getHivePartitionKey() != null) {
    String partitionKeys = options.getHivePartitionKey();
    String partitionValues = options.getHivePartitionValue();
    String[] pks = partitionKeys.split(",");
    String[] pvs = partitionValues.split(",");
    sb.append(" PARTITION (");
    for (int i = 0; i < pks.length; i++) {
      if (i != 0) {
        sb.append(" , ");
      }
      // pair the i-th key with the i-th value
      sb.append(pks[i]).append("='").append(pvs[i]).append("'");
    }
    sb.append(")");
  }

  LOG.debug("Load statement: " + sb.toString());
  return sb.toString();
}
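With --hive-overwrite and, for example, --hive-partition-key ORG_NO,LOAD_DATA --hive-partition-value 001,2018-01-01, the modified method should now emit a statement along these lines (the INPATH is again a placeholder for the actual getFinalPath() value):
LOAD DATA INPATH '/test/test_partition' OVERWRITE INTO TABLE `dbtest`.`testTable` PARTITION (ORG_NO='001' , LOAD_DATA='2018-01-01')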
To keep the --create-hive-table option usable, the following method needs the matching change:
/**
 * @return the CREATE TABLE statement for the table to load into hive.
 */
public String getCreateTableStmt() throws IOException {
  ……
  boolean first = true;
  String partitionKeys = options.getHivePartitionKey();
  for (String col : colNames) {
    if (partitionKeys != null) {
      // Fix: exclude every partition column from the imported columns.
      String[] pks = partitionKeys.split(",");
      for (String pk : pks) {
        if (col.equals(pk)) {
          throw new IllegalArgumentException("Partition key " + col + " cannot "
              + "be a column to import.");
        }
      }
    }

    if (!first) {
      sb.append(", ");
    }
    first = false;

    Integer colType = columnTypes.get(col);
    String hiveColType = userMapping.getProperty(col);
    if (hiveColType == null) {
      hiveColType = connManager.toHiveType(inputTableName, col, colType);
    }
    if (null == hiveColType) {
      throw new IOException("Hive does not support the SQL type for column "
          + col);
    }

    sb.append('`').append(col).append("` ").append(hiveColType);

    if (HiveTypes.isHiveTypeImprovised(colType)) {
      LOG.warn(
          "Column " + col + " had to be cast to a less precise type in Hive");
    }
  }

  sb.append(") ");

  if (commentsEnabled) {
    DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
    String curDateStr = dateFormat.format(new Date());
    sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
  }

  if (partitionKeys != null) {
    sb.append("PARTITIONED BY (");
    // Fix: declare each comma-separated partition column as a STRING partition key.
    String[] pks = partitionKeys.split(",");
    for (String pk : pks) {
      sb.append(pk).append(" STRING,");
    }
    sb.setLength(sb.length() - 1);  // drop the trailing comma
    sb.append(")");
  }

  sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
  sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
  sb.append("' LINES TERMINATED BY '");
  sb.append(getHiveOctalCharCode((int) options.getOutputRecordDelim()));
  ……
  LOG.debug("Create statement: " + sb.toString());
  return sb.toString();
}
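With comma-separated keys, the generated DDL should then end with a multi-column partition clause, roughly of the form below (the column list and trailing delimiter clauses are abbreviated here; the exact statement depends on the source table and the options used):
CREATE TABLE IF NOT EXISTS `dbtest`.`testTable` ( `ID` INT, ... ) PARTITIONED BY (ORG_NO STRING,LOAD_DATA STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\001' ...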
After the modification, the load is invoked as follows:
sqoop import \
--connect jdbc:oracle:thin:@{IP}:@{PORT}:hive \
--username test --password test --table test --columns col1,col2 \
--fields-terminated-by '\001' [--null-string '\\N'] \
-m 1 --create-hive-table \
--hive-drop-import-delims \
--hive-overwrite \
--hive-import \
--hive-partition-key KEY1[,KEY2,...] \
--hive-partition-value VALUE1[,VALUE2,...] \
--target-dir /test/test_partition \
--delete-target-dir
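For the dbtest.testTable example above, the partition arguments would be passed as, for instance:
--hive-partition-key ORG_NO,LOAD_DATA --hive-partition-value 001,2018-01-01
Note that the keys and values must be listed in the same order, since the modified code pairs the i-th key with the i-th value.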