Scattered Notes on Sqoop

Sqoop is an ETL tool; the name is short for "SQL-to-Hadoop". It imports data from relational databases into Hadoop. Sqoop can be used to create Hive tables and load data into them, which under the hood amounts to creating directories in HDFS and storing the data files there.

1. Sqoop basic commands

   @see url
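
A minimal, hedged sketch of the commands used most often (the connection string, credentials, and table names below are placeholder assumptions, not taken from the linked page):

# list databases / tables on the source
sqoop list-databases --connect jdbc:mysql://192.168.0.75:3306 --username root --password 123456
sqoop list-tables --connect jdbc:mysql://192.168.0.75:3306/testdb --username root --password 123456

# plain HDFS import of a single table
sqoop import --connect jdbc:mysql://192.168.0.75:3306/testdb --username root --password 123456 \
  --table T_TEST --target-dir /user/hive/warehouse/test.db/t_test -m 1

# export an HDFS directory back into a table
sqoop export --connect jdbc:mysql://192.168.0.75:3306/testdb --username root --password 123456 \
  --table T_TEST --export-dir /user/hive/warehouse/test.db/t_test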

2. Importing from MySQL into Hive

  --hive-database qianyang  # specify the target Hive database

    @see link
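
For illustration, a hedged sketch of a complete MySQL-to-Hive import using that option (host, credentials, and table names are placeholders):

sqoop import --connect jdbc:mysql://192.168.0.75:3306/testdb \
  --username root --password 123456 \
  --table T_TEST \
  --hive-import \
  --hive-database qianyang \
  --hive-table t_test \
  --fields-terminated-by '\001' \
  -m 1
# --hive-import creates the Hive table if necessary and then runs LOAD DATA INPATH
# on the imported files; add --hive-overwrite to replace any existing data.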

3. CRUD for Sqoop jobs

     @see sqoop_crud
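
The basic job lifecycle, shown as a hedged sketch (the job name and import arguments are placeholders):

# create: everything after the lone "--" is an ordinary sqoop tool invocation
sqoop job --create my_job -- import --connect jdbc:mysql://192.168.0.75:3306/testdb \
  --username root --table T_TEST --target-dir /tmp/t_test -m 1

sqoop job --list            # list all saved jobs
sqoop job --show my_job     # show the saved definition (prompts for the password)
sqoop job --exec my_job     # run the saved job
sqoop job --delete my_job   # delete it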

4. Viewing the Hive CREATE TABLE statement

Show a Hive table's CREATE TABLE statement: show create table tablename;
Show a Hive table's structure: describe tablename; (short form: desc tablename;)
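
Both can also be run non-interactively through hive -e, for example (database and table names are placeholders):

/usr/bin/hive -e "use test; show create table t_test;"
/usr/bin/hive -e "desc test.t_test;"

A listing like the one below can be produced this way after create-hive-table has run.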

   

/usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --hive-database $hdb --table $mysql_table
#>sqoop -version          Sqoop 1.4.6-cdh5.12.2
CREATE TABLE `test`(
  `id` string, 
   ...
  `create_date` string)
COMMENT 'Imported by sqoop on 2018/06/19 13:32:17'
ROW FORMAT SERDE 
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe' 
WITH SERDEPROPERTIES ( 
  'field.delim'='\u0001', 
  'line.delim'='\n', 
  'serialization.format'='\u0001') 
STORED AS INPUTFORMAT 
  'org.apache.hadoop.mapred.TextInputFormat' 
OUTPUTFORMAT 
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://ketech-server51:8020/user/hive/warehouse/mydb.db/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='1529415143')

 

 

5. Job example

 

 

  #@see https://www.jianshu.com/p/084d1b1e094c

    

#!/bin/bash
source /etc/profile
source /root/.bashrc
source /etc/hive/conf/hive-env.sh
source /etc/sqoop/conf/sqoop-env.sh
# specify the hdfs and hive users
export HDFS_USER=hdfs
export HIVE_USER=hive
##############################################
##  $1: date   $2: table name
##  The first argument is the date, the second is the source table name
##  (note: the body below hard-codes these values instead of using them)
##############################################
# Hive database name (if unset, the default is: default)
hdb=test
# Hive table name
hive_table=t_test
# source table name
mysql_table=T_TEST
# database server address (the JDBC URL below uses the Oracle thin driver despite the mysql_* variable names)
server=192.168.0.75
# database port
port=1521
# database name
mysql_database=orcl
# username
username=root
# password
password=123456
job_name=cdr_record

# Check whether the Hive table exists: if not, run the create statement below; otherwise skip it
/usr/bin/hive -e "use $hdb;select * from $hive_table limit 1;"
if [ $? -ne 0 ]
then
    echo "表不存在,执行创建表结构"
    /usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --hive-database $hdb --table $mysql_table
else
    echo "表已存在,执行增量导入。。。"
fi
#exit
#
#One mode is append: specify a monotonically increasing column, e.g.:
#--incremental append  --check-column num_iid --last-value 0
#The other mode is based on a timestamp, e.g.:
#--incremental lastmodified --check-column created --last-value '2012-02-01 11:0:00'
#i.e. only rows whose created value is later than '2012-02-01 11:0:00' are imported.

sqoop job --show $job_name > /dev/null 2>&1
if [ $? -ne 0 ]
then
    echo "job不存在,执行创建"
    echo "创建job"
    #append
    /usr/bin/sqoop job --create $job_name -- import --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --table $mysql_table --fields-terminated-by "\001" --null-string '\\N' --null-non-string '\\N' --target-dir /user/hive/warehouse/test.db/t_test --incremental lastmodified --check-column CREATE_DATE -m 1 -z --append
else
    echo "job已存在,执行增量导入。。。"
    echo "append增量导入模式启动。。。"
    /usr/bin/sqoop job --exec $job_name
fi

exit
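
To make the incremental import run periodically, the script above could be scheduled with cron; a hedged example (the script path and schedule are assumptions):

# crontab -e  -- run every day at 01:30 and append the output to a log
30 1 * * * /bin/bash /root/scripts/sqoop_incr_import.sh >> /var/log/sqoop_incr_import.log 2>&1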

6. Table-creation source code

/**
 *  Sqoop 1.4.3
 */

package org.apache.sqoop.hive;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sqoop.io.CodecMap;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.manager.ConnManager;

/**
 * Creates (Hive-specific) SQL DDL statements to create tables to hold data
 * we're importing from another source.
 *
 * After we import the database into HDFS, we can inject it into Hive using the
 * CREATE TABLE and LOAD DATA INPATH statements generated by this object.
 */
public class TableDefWriter {

	public static final Log LOG = LogFactory.getLog(TableDefWriter.class.getName());

	private SqoopOptions options;
	private ConnManager connManager;
	private Configuration configuration;
	private String inputTableName;
	private String outputTableName;
	private boolean commentsEnabled;

	/**
	 * Creates a new TableDefWriter to generate a Hive CREATE TABLE statement.
	 * 
	 * @param opts
	 *            program-wide options
	 * @param connMgr
	 *            the connection manager used to describe the table.
	 * @param inputTable
	 *            the name of the table to load.
	 * @param outputTable
	 *            the name of the Hive table to create.
	 * @param config
	 *            the Hadoop configuration to use to connect to the dfs
	 * @param withComments
	 *            if true, then tables will be created with a timestamp comment.
	 */
	public TableDefWriter(final SqoopOptions opts, final ConnManager connMgr, final String inputTable,
			final String outputTable, final Configuration config, final boolean withComments) {
		this.options = opts;
		this.connManager = connMgr;
		this.inputTableName = inputTable;
		this.outputTableName = outputTable;
		this.configuration = config;
		this.commentsEnabled = withComments;
	}

	private Map externalColTypes;

	/**
	 * Set the column type map to be used. (dependency injection for testing;
	 * not used in production.)
	 */
	public void setColumnTypes(Map colTypes) {
		this.externalColTypes = colTypes;
		LOG.debug("Using test-controlled type map");
	}

	/**
	 * Get the column names to import.
	 */
	private String[] getColumnNames() {
		String[] colNames = options.getColumns();
		if (null != colNames) {
			return colNames; // user-specified column names.
		} else if (null != externalColTypes) {
			// Test-injection column mapping. Extract the col names from this.
			ArrayList keyList = new ArrayList();
			for (String key : externalColTypes.keySet()) {
				keyList.add(key);
			}

			return keyList.toArray(new String[keyList.size()]);
		} else if (null != inputTableName) {
			return connManager.getColumnNames(inputTableName);
		} else {
			return connManager.getColumnNamesForQuery(options.getSqlQuery());
		}
	}

	/**
	 * @return the CREATE TABLE statement for the table to load into hive.
	 */
	public String getCreateTableStmt() throws IOException {
		Map columnTypes;
		Properties userMapping = options.getMapColumnHive();

		if (externalColTypes != null) {
			// Use pre-defined column types.
			columnTypes = externalColTypes;
		} else {
			// Get these from the database.
			if (null != inputTableName) {
				columnTypes = connManager.getColumnTypes(inputTableName);
			} else {
				columnTypes = connManager.getColumnTypesForQuery(options.getSqlQuery());
			}
		}

		String[] colNames = getColumnNames();
		StringBuilder sb = new StringBuilder();
		if (options.doFailIfHiveTableExists()) {
			sb.append("CREATE TABLE `").append(outputTableName).append("` ( ");
		} else {
			sb.append("CREATE TABLE IF NOT EXISTS `");
			sb.append(outputTableName).append("` ( ");
		}

		// Check that all explicitly mapped columns are present in result set
		for (Object column : userMapping.keySet()) {
			boolean found = false;
			for (String c : colNames) {
				if (c.equals(column)) {
					found = true;
					break;
				}
			}

			if (!found) {
				throw new IllegalArgumentException("No column by the name " + column + "found while importing data");
			}
		}

		boolean first = true;
		String partitionKey = options.getHivePartitionKey();
		for (String col : colNames) {
			if (col.equals(partitionKey)) {
				throw new IllegalArgumentException("Partition key " + col + " cannot " + "be a column to import.");
			}

			if (!first) {
				sb.append(", ");
			}

			first = false;

			Integer colType = columnTypes.get(col);
			String hiveColType = userMapping.getProperty(col);
			if (hiveColType == null) {
				hiveColType = connManager.toHiveType(inputTableName, col, colType);
			}
			if (null == hiveColType) {
				throw new IOException("Hive does not support the SQL type for column " + col);
			}

			sb.append('`').append(col).append("` ").append(hiveColType);

			if (HiveTypes.isHiveTypeImprovised(colType)) {
				LOG.warn("Column " + col + " had to be cast to a less precise type in Hive");
			}
		}

		sb.append(") ");

		if (commentsEnabled) {
			DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
			String curDateStr = dateFormat.format(new Date());
			sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
		}

		if (partitionKey != null) {
			sb.append("PARTITIONED BY (").append(partitionKey).append(" STRING) ");
		}

		sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
		sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
		sb.append("' LINES TERMINATED BY '");
		sb.append(getHiveOctalCharCode((int) options.getOutputRecordDelim()));
		String codec = options.getCompressionCodec();
		if (codec != null && (codec.equals(CodecMap.LZOP) || codec.equals(CodecMap.getCodecClassName(CodecMap.LZOP)))) {
			sb.append("' STORED AS INPUTFORMAT " + "'com.hadoop.mapred.DeprecatedLzoTextInputFormat'");
			sb.append(" OUTPUTFORMAT " + "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'");
		} else {
			sb.append("' STORED AS TEXTFILE");
		}

		LOG.debug("Create statement: " + sb.toString());
		return sb.toString();
	}

	/**
	 * @return the LOAD DATA statement to import the data in HDFS into hive.
	 */
	public String getLoadDataStmt() throws IOException {
		Path finalPath = getFinalPath();

		StringBuilder sb = new StringBuilder();
		sb.append("LOAD DATA INPATH '");
		sb.append(finalPath.toString() + "'");
		if (options.doOverwriteHiveTable()) {
			sb.append(" OVERWRITE");
		}
		sb.append(" INTO TABLE `");
		sb.append(outputTableName);
		sb.append('`');

		if (options.getHivePartitionKey() != null) {
			sb.append(" PARTITION (").append(options.getHivePartitionKey()).append("='")
					.append(options.getHivePartitionValue()).append("')");
		}

		LOG.debug("Load statement: " + sb.toString());
		return sb.toString();
	}

	public Path getFinalPath() throws IOException {
		String warehouseDir = options.getWarehouseDir();
		if (null == warehouseDir) {
			warehouseDir = "";
		} else if (!warehouseDir.endsWith(File.separator)) {
			warehouseDir = warehouseDir + File.separator;
		}

		// Final path is determined in the following order:
		// 1. Use target dir if the user specified.
		// 2. Use input table name.
		String tablePath = null;
		String targetDir = options.getTargetDir();
		if (null != targetDir) {
			tablePath = warehouseDir + targetDir;
		} else {
			tablePath = warehouseDir + inputTableName;
		}
		FileSystem fs = FileSystem.get(configuration);
		return new Path(tablePath).makeQualified(fs);
	}

	/**
	 * Return a string identifying the character to use as a delimiter in Hive,
	 * in octal representation. Hive can specify delimiter characters in the
	 * form '\ooo' where ooo is a three-digit octal number between 000 and 177.
	 * Values may not be truncated ('\12' is wrong; '\012' is ok) nor may they
	 * be zero-prefixed (e.g., '\0177' is wrong).
	 *
	 * @param charNum
	 *            the character to use as a delimiter
	 * @return a string of the form "\ooo" where ooo is an octal number in [000,
	 *         177].
	 * @throws IllegalArgumentException
	 *             if charNum > 0177.
	 */
	public static String getHiveOctalCharCode(int charNum) {
		if (charNum > 0177) {
			throw new IllegalArgumentException("Character " + charNum + " is an out-of-range delimiter");
		}

		return String.format("\\%03o", charNum);
	}

}
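
The CREATE TABLE statement built by getCreateTableStmt() is essentially the one shown in section 4. The companion statement from getLoadDataStmt() looks roughly like the following when executed through the hive CLI (the HDFS path and table name are placeholder assumptions, not captured output):

/usr/bin/hive -e "LOAD DATA INPATH 'hdfs://ketech-server51:8020/user/root/T_TEST' OVERWRITE INTO TABLE \`t_test\`"
# OVERWRITE is only emitted when --hive-overwrite is set; with --hive-partition-key,
# a PARTITION (key='value') clause is appended as well.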

 

1. Duplicate rows from Sqoop incremental imports
(duplicate-data problem: https://blog.csdn.net/qq_20641565/article/details/52763663)
    Solution: specify the primary key via --merge-key id. In merge-key mode Sqoop runs an additional, full MapReduce pass to merge the new data and update existing rows.     https://blog.csdn.net/qq_26937525/article/details/53670213
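
For illustration, a hedged sketch of a lastmodified import that deduplicates with --merge-key instead of --append (connection details, column names, and paths are placeholders):

/usr/bin/sqoop import \
  --connect jdbc:oracle:thin:@$server:$port:$mysql_database \
  --username $username --password $password \
  --table $mysql_table \
  --target-dir /user/hive/warehouse/test.db/t_test \
  --incremental lastmodified \
  --check-column CREATE_DATE \
  --last-value '2018-06-19 00:00:00' \
  --merge-key ID \
  -m 1
# --merge-key ID replaces --append: Sqoop runs an extra merge MapReduce job that
# overwrites older versions of rows whose ID already exists in the target directory.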

 

2. Sqoop compatibility issues

Time-zone issues, precision issues, --direct supporting only MySQL and certain PostgreSQL versions, and data-type mapping issues.

--direct
--direct mode does not support BLOB or CLOB columns, nor views.
For performance it commits roughly every 32 MB by default; this can be changed with -D sqoop.mysql.export.checkpoint.bytes=size.
The unit is bytes; setting it to 0 disables checkpointing.
When exporting into a production database that users are still working against, a staging table keeps the export stable, but it will still put some load on production.
split-by
When the --split-by column is not an integer type, the problem described above can appear. The workaround found so far is to set -m to 1 and leave --split-by unset, i.e. run a single map task; the drawback is losing parallel map loading. (Note: when -m is greater than 1, --split-by must specify a column.)
--staging-table
The temporary table used during an export.
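
As an illustration of these export-related options, two hedged sketches (hosts, credentials, and table names are placeholders):

# direct-mode export (mysqlimport under the hood); commit size tuned via the -D property
/usr/bin/sqoop export \
  -D sqoop.mysql.export.checkpoint.bytes=33554432 \
  --connect jdbc:mysql://192.168.0.75:3306/testdb \
  --username root --password 123456 \
  --table T_TEST \
  --export-dir /user/hive/warehouse/test.db/t_test \
  --input-fields-terminated-by '\001' \
  --direct

# staging-table export (staging is not available together with --direct)
/usr/bin/sqoop export \
  --connect jdbc:mysql://192.168.0.75:3306/testdb \
  --username root --password 123456 \
  --table T_TEST \
  --staging-table T_TEST_STAGE \
  --clear-staging-table \
  --export-dir /user/hive/warehouse/test.db/t_test \
  --input-fields-terminated-by '\001'
# T_TEST_STAGE must be an existing table with the same schema as T_TEST; rows are
# written there first and moved into T_TEST in a single transaction at the end.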

3. Sqoop usage notes

 https://www.jianshu.com/p/bb78ccd0252f

References

https://yq.aliyun.com/articles/60288

 
