Sqoop is an ETL tool; the name is short for "SQL to Hadoop". It imports data from relational databases into Hadoop. Sqoop can be used to create Hive tables and load data into them, which under the hood just means creating directories in HDFS and storing the data files there.
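For example, a plain import into HDFS (the connection string, credentials, table name and paths below are made-up placeholders) ends up as ordinary files under the chosen target directory:
sqoop import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --password pass \
  --table t_demo --target-dir /user/hive/warehouse/sourcedb.db/t_demo -m 1
# the imported data is just delimited text files sitting in HDFS
hdfs dfs -ls /user/hive/warehouse/sourcedb.db/t_demo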
1. Sqoop basic commands
@see url
2. Importing from MySQL into Hive
--hive-database qianyang # specify the Hive database
@see link
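A minimal hive-import sketch using --hive-database (host, credentials and table names here are placeholders):
sqoop import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --password pass \
  --table t_demo --hive-import --hive-database qianyang --hive-table t_demo \
  --fields-terminated-by "\001" -m 1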
3. CRUD operations on Sqoop jobs
@see sqoop_crud
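The saved-job lifecycle comes down to a few subcommands; a sketch (the job name my_job is a placeholder):
sqoop job --list                  # list saved jobs
sqoop job --show my_job           # show a job's definition and stored state (e.g. incremental last-value)
sqoop job --exec my_job           # run the job
sqoop job --delete my_job         # delete the job
# create: everything after the standalone "--" is an ordinary sqoop tool invocation
sqoop job --create my_job -- import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --table t_demo -m 1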
4. Viewing the Hive CREATE TABLE statement
View a Hive table's CREATE statement: show create table tablename;
View a Hive table's structure: describe tablename; (abbreviation: desc tablename;)
/usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --hive-database $hdb --table $mysql_table
# sqoop version used: Sqoop 1.4.6-cdh5.12.2
CREATE TABLE `test`(
`id` string,
...
`create_date` string)
COMMENT 'Imported by sqoop on 2018/06/19 13:32:17'
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\u0001',
'line.delim'='\n',
'serialization.format'='\u0001')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://ketech-server51:8020/user/hive/warehouse/mydb.db/test'
TBLPROPERTIES (
'transient_lastDdlTime'='1529415143')
5. Job example
#@see https://www.jianshu.com/p/084d1b1e094c
#!/bin/bash
source /etc/profile
source /root/.bashrc
source /etc/hive/conf/hive-env.sh
source /etc/sqoop/conf/sqoop-env.sh
# specify the hive and hdfs users
export HDFS_USER=hdfs
export HIVE_USER=hive
##############################################
## $1: date   $2: table name
## the first argument is the date, the second is the MySQL table name
##############################################
# Hive database name (default database: default)
hdb=test
# Hive table name
hive_table=t_test
# source table name (despite the mysql_* variable names, this example connects to Oracle via jdbc:oracle:thin)
mysql_table=T_TEST
# database server address
server=192.168.0.75
# database port
port=1521
# database name
mysql_database=orcl
# username
username=root
# password
password=123456
job_name=cdr_record
# Check whether the Hive table exists; if not, run the create statement below, otherwise skip it
/usr/bin/hive -e "use $hdb;select * from $hive_table limit 1;"
if [ $? -ne 0 ]
then
echo "表不存在,执行创建表结构"
/usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --hive-database $hdb --table $mysql_table
else
echo "表已存在,执行增量导入。。。"
fi
#exit
#
# One mode is append, which relies on a monotonically increasing column, e.g.:
#--incremental append --check-column num_iid --last-value 0
# The other mode is based on a timestamp, e.g.:
#--incremental lastmodified --check-column created --last-value '2012-02-01 11:0:00'
# i.e. only rows whose created value is greater than '2012-02-01 11:0:00' are imported.
sqoop job --show $job_name > /dev/null 2>&1
if [ $? -ne 0 ]
then
echo "job不存在,执行创建"
echo "创建job"
#append
/usr/bin/sqoop job --create $job_name -- import --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --table $mysql_table --fields-terminated-by "\001" --null-string '\\N' --null-non-string '\\N' --target-dir /user/hive/warehouse/test.db/t_test --incremental lastmodified --check-column CREATE_DATE -m 1 -z --append
else
echo "job已存在,执行增量导入。。。"
echo "append增量导入模式启动。。。"
/usr/bin/sqoop job --exec $job_name
fi
exit
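For reference, the two incremental modes described in the comments above can be sketched as standalone saved jobs (connection string, table and column names are placeholders; a saved job records the new last-value after every --exec):
# append mode: only rows whose id exceeds the stored last-value are imported
sqoop job --create demo_append -- import --connect jdbc:mysql://dbhost:3306/sourcedb \
  --username user --password pass --table t_demo \
  --incremental append --check-column id --last-value 0 -m 1
# lastmodified mode: only rows whose update_time is newer than the last run are imported
sqoop job --create demo_lastmod -- import --connect jdbc:mysql://dbhost:3306/sourcedb \
  --username user --password pass --table t_demo \
  --incremental lastmodified --check-column update_time --append -m 1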
6. Table creation source code
/**
* Sqoop 1.4.3
*/
package org.apache.sqoop.hive;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sqoop.io.CodecMap;
import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.manager.ConnManager;
/**
* Creates (Hive-specific) SQL DDL statements to create tables to hold data
* we're importing from another source.
*
* After we import the database into HDFS, we can inject it into Hive using the
* CREATE TABLE and LOAD DATA INPATH statements generated by this object.
*/
public class TableDefWriter {
public static final Log LOG = LogFactory.getLog(TableDefWriter.class.getName());
private SqoopOptions options;
private ConnManager connManager;
private Configuration configuration;
private String inputTableName;
private String outputTableName;
private boolean commentsEnabled;
/**
* Creates a new TableDefWriter to generate a Hive CREATE TABLE statement.
*
* @param opts
* program-wide options
* @param connMgr
* the connection manager used to describe the table.
* @param inputTable
* the name of the table to load.
* @param outputTable
* the name of the Hive table to create.
* @param config
* the Hadoop configuration to use to connect to the dfs
* @param withComments
* if true, then tables will be created with a timestamp comment.
*/
public TableDefWriter(final SqoopOptions opts, final ConnManager connMgr, final String inputTable,
final String outputTable, final Configuration config, final boolean withComments) {
this.options = opts;
this.connManager = connMgr;
this.inputTableName = inputTable;
this.outputTableName = outputTable;
this.configuration = config;
this.commentsEnabled = withComments;
}
private Map<String, Integer> externalColTypes;
/**
* Set the column type map to be used. (dependency injection for testing;
* not used in production.)
*/
public void setColumnTypes(Map<String, Integer> colTypes) {
this.externalColTypes = colTypes;
LOG.debug("Using test-controlled type map");
}
/**
* Get the column names to import.
*/
private String[] getColumnNames() {
String[] colNames = options.getColumns();
if (null != colNames) {
return colNames; // user-specified column names.
} else if (null != externalColTypes) {
// Test-injection column mapping. Extract the col names from this.
ArrayList<String> keyList = new ArrayList<String>();
for (String key : externalColTypes.keySet()) {
keyList.add(key);
}
return keyList.toArray(new String[keyList.size()]);
} else if (null != inputTableName) {
return connManager.getColumnNames(inputTableName);
} else {
return connManager.getColumnNamesForQuery(options.getSqlQuery());
}
}
/**
* @return the CREATE TABLE statement for the table to load into hive.
*/
public String getCreateTableStmt() throws IOException {
Map<String, Integer> columnTypes;
Properties userMapping = options.getMapColumnHive();
if (externalColTypes != null) {
// Use pre-defined column types.
columnTypes = externalColTypes;
} else {
// Get these from the database.
if (null != inputTableName) {
columnTypes = connManager.getColumnTypes(inputTableName);
} else {
columnTypes = connManager.getColumnTypesForQuery(options.getSqlQuery());
}
}
String[] colNames = getColumnNames();
StringBuilder sb = new StringBuilder();
if (options.doFailIfHiveTableExists()) {
sb.append("CREATE TABLE `").append(outputTableName).append("` ( ");
} else {
sb.append("CREATE TABLE IF NOT EXISTS `");
sb.append(outputTableName).append("` ( ");
}
// Check that all explicitly mapped columns are present in result set
for (Object column : userMapping.keySet()) {
boolean found = false;
for (String c : colNames) {
if (c.equals(column)) {
found = true;
break;
}
}
if (!found) {
throw new IllegalArgumentException("No column by the name " + column + "found while importing data");
}
}
boolean first = true;
String partitionKey = options.getHivePartitionKey();
for (String col : colNames) {
if (col.equals(partitionKey)) {
throw new IllegalArgumentException("Partition key " + col + " cannot " + "be a column to import.");
}
if (!first) {
sb.append(", ");
}
first = false;
Integer colType = columnTypes.get(col);
String hiveColType = userMapping.getProperty(col);
if (hiveColType == null) {
hiveColType = connManager.toHiveType(inputTableName, col, colType);
}
if (null == hiveColType) {
throw new IOException("Hive does not support the SQL type for column " + col);
}
sb.append('`').append(col).append("` ").append(hiveColType);
if (HiveTypes.isHiveTypeImprovised(colType)) {
LOG.warn("Column " + col + " had to be cast to a less precise type in Hive");
}
}
sb.append(") ");
if (commentsEnabled) {
DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
String curDateStr = dateFormat.format(new Date());
sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
}
if (partitionKey != null) {
sb.append("PARTITIONED BY (").append(partitionKey).append(" STRING) ");
}
sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
sb.append("' LINES TERMINATED BY '");
sb.append(getHiveOctalCharCode((int) options.getOutputRecordDelim()));
String codec = options.getCompressionCodec();
if (codec != null && (codec.equals(CodecMap.LZOP) || codec.equals(CodecMap.getCodecClassName(CodecMap.LZOP)))) {
sb.append("' STORED AS INPUTFORMAT " + "'com.hadoop.mapred.DeprecatedLzoTextInputFormat'");
sb.append(" OUTPUTFORMAT " + "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'");
} else {
sb.append("' STORED AS TEXTFILE");
}
LOG.debug("Create statement: " + sb.toString());
return sb.toString();
}
/**
* @return the LOAD DATA statement to import the data in HDFS into hive.
*/
public String getLoadDataStmt() throws IOException {
Path finalPath = getFinalPath();
StringBuilder sb = new StringBuilder();
sb.append("LOAD DATA INPATH '");
sb.append(finalPath.toString() + "'");
if (options.doOverwriteHiveTable()) {
sb.append(" OVERWRITE");
}
sb.append(" INTO TABLE `");
sb.append(outputTableName);
sb.append('`');
if (options.getHivePartitionKey() != null) {
sb.append(" PARTITION (").append(options.getHivePartitionKey()).append("='")
.append(options.getHivePartitionValue()).append("')");
}
LOG.debug("Load statement: " + sb.toString());
return sb.toString();
}
public Path getFinalPath() throws IOException {
String warehouseDir = options.getWarehouseDir();
if (null == warehouseDir) {
warehouseDir = "";
} else if (!warehouseDir.endsWith(File.separator)) {
warehouseDir = warehouseDir + File.separator;
}
// Final path is determined in the following order:
// 1. Use target dir if the user specified.
// 2. Use input table name.
String tablePath = null;
String targetDir = options.getTargetDir();
if (null != targetDir) {
tablePath = warehouseDir + targetDir;
} else {
tablePath = warehouseDir + inputTableName;
}
FileSystem fs = FileSystem.get(configuration);
return new Path(tablePath).makeQualified(fs);
}
/**
* Return a string identifying the character to use as a delimiter in Hive,
* in octal representation. Hive can specify delimiter characters in the
* form '\ooo' where ooo is a three-digit octal number between 000 and 177.
* Values may not be truncated ('\12' is wrong; '\012' is ok) nor may they
* be zero-prefixed (e.g., '\0177' is wrong).
*
* @param charNum
* the character to use as a delimiter
* @return a string of the form "\ooo" where ooo is an octal number in [000,
* 177].
* @throws IllegalArgumentException
* if charNum > 0177.
*/
public static String getHiveOctalCharCode(int charNum) {
if (charNum > 0177) {
throw new IllegalArgumentException("Character " + charNum + " is an out-of-range delimiter");
}
return String.format("\\%03o", charNum);
}
}
1. Duplicate data from Sqoop incremental imports
(duplicate data problem: https://blog.csdn.net/qq_20641565/article/details/52763663)
Solution: specify the primary key with --merge-key id. In merge-key mode Sqoop runs a full MapReduce pass to merge the updates. https://blog.csdn.net/qq_26937525/article/details/53670213
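A sketch of a lastmodified import that deduplicates on the primary key via --merge-key (connection string and names are placeholders):
sqoop import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --password pass \
  --table t_demo --target-dir /user/hive/warehouse/test.db/t_demo \
  --incremental lastmodified --check-column update_time --merge-key id -m 1
# updated rows replace the old rows with the same id instead of being appended as duplicates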
2. Sqoop compatibility issues
Time zone issues, precision issues, --direct only supporting certain versions of MySQL and PostgreSQL, and data type mapping issues.
--direct
--direct mode does not support BLOB or CLOB columns, nor views.
For performance, a commit is made every 32 MB by default; this can be tuned with -D sqoop.mysql.export.checkpoint.bytes=size.
The unit is bytes; setting it to 0 disables checkpointing.
When loading data into a production database that users are still querying, a direct load will certainly affect production, so we use a staging table to address stability.
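A sketch of a MySQL --direct export with the checkpoint size mentioned above tuned (host, credentials and names are placeholders; note that -D generic options must come right after the tool name):
sqoop export -D sqoop.mysql.export.checkpoint.bytes=67108864 \
  --connect jdbc:mysql://dbhost:3306/targetdb --username user --password pass \
  --table t_demo --export-dir /user/hive/warehouse/test.db/t_demo --direct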
split-by
The problem described in the scenario above appears when the --split-by column is not an integer type. The workaround so far: set -m to 1 and leave --split-by unset, so only a single map task runs; the drawback is that the data cannot be loaded by parallel map tasks. (Note: when -m is greater than 1, --split-by must be given a column.)
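A sketch of the two options (names are placeholders):
# single map task: no --split-by needed
sqoop import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --password pass \
  --table t_demo -m 1
# parallel import: with -m greater than 1, give --split-by a (preferably integer) column
sqoop import --connect jdbc:mysql://dbhost:3306/sourcedb --username user --password pass \
  --table t_demo -m 4 --split-by id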
--staging-table
A temporary (staging) table used when exporting data.
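A sketch of an export routed through a staging table (names are placeholders; the staging table must already exist with the same schema as the target, and --staging-table cannot be combined with --direct):
sqoop export --connect jdbc:mysql://dbhost:3306/targetdb --username user --password pass \
  --table t_demo --staging-table t_demo_stage --clear-staging-table \
  --export-dir /user/hive/warehouse/test.db/t_demo
# rows land in t_demo_stage first and are moved to t_demo in a single transaction at the end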
3. Sqoop usage notes
https://www.jianshu.com/p/bb78ccd0252f
References
https://yq.aliyun.com/articles/60288