Sqoop is mostly used today for loading data warehouses. In many cases each table to be synchronized needs its own Sqoop script; whether a run succeeded, how many rows were moved, and whether the source table has been updated are often unknown, and the script files themselves are awkward to manage.
The data platform introduced here adds proper management of these jobs.
Below is the table design: task_info holds the per-task Sqoop parameters, db_info holds the source database connections (with encrypted passwords), and task_log records each run's time window, result, and row count.
DROP TABLE IF EXISTS task_info;
CREATE TABLE task_info (
    task_id            INT,
    database_id        INT,
    db_table           VARCHAR(100),
    hive_table         VARCHAR(100),
    db_columns         VARCHAR(5000),
    where_express      VARCHAR(200),
    is_hive_import     VARCHAR(2),
    hive_partition_key VARCHAR(10),
    mappers            INT,
    split_by           VARCHAR(10),
    is_use             VARCHAR(2),
    priority           INT,
    cron               VARCHAR(100),
    alter_date         LONGTEXT,
    meta_data          LONGTEXT,
    PRIMARY KEY (task_id)
);

DROP TABLE IF EXISTS db_info;
CREATE TABLE db_info (
    id       INT,
    uname    VARCHAR(100),
    password VARCHAR(500),
    url      VARCHAR(200),
    PRIMARY KEY (id)
);

DROP TABLE IF EXISTS task_log;
CREATE TABLE task_log (
    name       VARCHAR(100),
    begin_time VARCHAR(100),
    end_time   VARCHAR(100),
    result     VARCHAR(100),
    data_num   VARCHAR(50),
    log        LONGTEXT
);
All the parameters Sqoop needs, including the source tables themselves, are stored as configuration in these tables. To add or change a task you simply run an SQL statement and let the scheduler pick it up on its next pass; the scheduler itself runs on a timer.
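For example, a data source and a task might be registered like this. The values are hypothetical placeholders, not taken from a real environment; the password column holds the encrypted form produced by the encrypter shown below.

INSERT INTO db_info (id, uname, password, url)
VALUES (1, 'etl_user', '122u110u83u117', 'jdbc:oracle:thin:@//db-host:1521/ORCL');

INSERT INTO task_info
    (task_id, database_id, db_table, hive_table, db_columns, where_express,
     is_hive_import, hive_partition_key, mappers, split_by, is_use, priority, cron)
VALUES
    (1, 1, 'ORDERS', 'dw.orders', 'ORDER_ID,USER_ID,AMOUNT,CREATE_TIME', NULL,
     'Y', 'ds', 4, 'ORDER_ID', 'Y', 1, '0 2 * * *');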
Next is the core code.
public class SecureEncrypter implements Encrypter {

    private int seed = 0;

    public SecureEncrypter(int seed) {
        this.seed = seed;
    }

    /** XOR each character with the seed and join the resulting codes with 'u'. */
    @Override
    public String encode(String input) {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < input.length(); i++) {
            int k = (int) input.charAt(i);
            builder.append("u" + new Integer(k ^ seed).toString());
        }
        // drop the leading 'u' so decode() can split cleanly
        return builder.substring(1);
    }

    /** Reverse of encode(): split on 'u', XOR each code with the seed, rebuild the string. */
    @Override
    public String decode(String input) {
        String[] arr = input.split("u");
        StringBuilder builder = new StringBuilder();
        for (String str : arr) {
            int t = Integer.valueOf(str);
            t = t ^ seed;
            builder.append((char) t);
        }
        return builder.toString();
    }
}
The class above handles password encryption and decryption: database passwords should not be exposed in plain text, so they are stored encrypted in db_info and only decrypted at runtime to be passed to Sqoop as the --password argument.
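A minimal usage sketch follows. The seed value here is made up; in the platform itself the instance comes from EncrypterFactory.getEncryperInstance(1), as seen in the extract code below.

public class EncrypterDemo {
    public static void main(String[] args) {
        // Hypothetical seed; the real seed is whatever the factory configures.
        Encrypter enc = new SecureEncrypter(23);

        String cipher = enc.encode("myDbPassword"); // character codes XOR'd with the seed, joined by 'u'
        String plain  = enc.decode(cipher);         // back to "myDbPassword"

        System.out.println(cipher);
        System.out.println(plain);
    }
}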
public class HDFSExtract implements Extract {

    /** This class drives the extraction of a source table into HDFS/Hive via Sqoop. */
    public static Logger log = Logger.getLogger(HDFSExtract.class);

    private Connection con = null;
    private Statement st = null;
    private ResultSet rs = null;

    private int task_id;
    private String db_user = null;
    private String db_password = null;
    private String db_table = null;
    private String db_columns = null;
    private String url = null;
    private String where = null;
    private String is_hive_import = null;
    private String hive_table = null;
    private String hive_partition_key = null;
    private String split_by = null;
    private String count_date = null;
    private String mappers = null;

    public void extract(TaskInfo tInfo) {
        // 1. Read the task configuration (yhd_extract_to_hdfs); count_date defaults to yesterday
        count_date = ManaUtils.getData(-1);
        try {
            if (tInfo.getAlter_date() != null && !"".equals(tInfo.getAlter_date())) {
                count_date = tInfo.getAlter_date();
            }
            // 2. Look up the data source credentials for this task
            String sql = "select d.uname,d.password,d.url from db_info d where d.id = ";
            sql += tInfo.getDatabase_id();
            log.info("sql: " + sql);
            con = DBConnection.getInstance();
            st = con.createStatement();
            rs = st.executeQuery(sql);
            Runtime run = null;
            if (rs.next()) {
                ETL_vo vo = new ETL_vo();
                setFromResultSet(rs, tInfo);
                vo.setTableName(hive_table);
                run = Runtime.getRuntime();
                try {
                    // 3. Remove any leftover target directory from a previous run
                    run.exec("hadoop fs -rmr " + db_table.trim());
                    run.exec("hadoop fs -rmr " + db_table.trim().toUpperCase());
                    run.exec("hadoop fs -rmr " + db_table.trim().toLowerCase());
                    log.info("Begin execute task " + task_id + " " + hive_table + " ......" + ManaUtils.sdf.format(new Date()));
                    vo.setBeginTime(ManaUtils.sdf.format(new Date()));
                    vo.setResult("SUCCESS");
                    System.out.println(toString(createOptions()));
                    // 4. Launch the generated sqoop command and hand the process to the log handler
                    Process p = run.exec(createOptions());
                    ManaUtils.exeOption(p, tInfo, st, null);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
            log.info("Finish Extracting ! " + ManaUtils.sdf.format(new Date()));
        } catch (Exception e) {
            log.info("Error: extract fail !");
            e.printStackTrace();
            return;
        } finally {
            // The shared connection from DBConnection.getInstance() is not closed here.
        }
    }

    public void setFromResultSet(ResultSet rs, TaskInfo tInfo) throws SQLException {
        task_id = tInfo.getTask_id();
        db_user = rs.getString("uname").trim();
        db_password = rs.getString("password").trim();
        url = rs.getString("url");
        db_table = tInfo.getDb_table();
        db_columns = tInfo.getDb_columns();
        where = tInfo.getWhere_express();
        is_hive_import = tInfo.getIs_hive_import();
        hive_table = tInfo.getHive_table();
        hive_partition_key = tInfo.getHive_partition_key();
        mappers = tInfo.getMappers().toString();
        split_by = tInfo.getSplit_by();
    }

    /** Join the option array into a single printable command string. */
    public String toString(String[] args) {
        StringBuffer stringBuffer = new StringBuffer();
        for (String a : args) {
            stringBuffer.append(a);
            stringBuffer.append(" ");
        }
        return stringBuffer.toString();
    }

    /** Assemble the sqoop import command from the task configuration. */
    public String[] createOptions() throws Exception {
        List<String> optSb = new ArrayList<String>();
        optSb.add("sqoop");
        optSb.add("import");
        optSb.add("-D");
        optSb.add("mapred.job.queue.name=pms");
        optSb.add("--connect");
        optSb.add(url);
        optSb.add("--username");
        optSb.add(db_user);
        optSb.add("--password");
        // decrypt the stored password before handing it to sqoop
        optSb.add(EncrypterFactory.getEncryperInstance(1).decode(db_password));
        if (mappers != null && !"".equals(mappers)) {
            optSb.add("-m");
            optSb.add(mappers);
        }
        if (split_by != null && !"".equals(split_by)) {
            optSb.add("--split-by");
            optSb.add(split_by.toUpperCase());
        }
        optSb.add("--null-string");
        optSb.add("''");
        optSb.add("--table");
        optSb.add(db_table);
        // The following task fields are optional, so each one is checked first
        if (db_columns != null && !"".equals(db_columns)) {
            optSb.add("--columns");
            optSb.add("\"" + db_columns.toUpperCase() + "\"");
        }
        if (where != null && !"".equals(where)) {
            optSb.add("--where");
            // $TXDATE in the configured condition is replaced by the run date
            where = where.replaceAll("\\$TXDATE", "'" + count_date + "'");
            optSb.add("\"" + where.trim() + "\"");
        }
        if (is_hive_import != null && !"".equals(is_hive_import) && "Y".equalsIgnoreCase(is_hive_import)) {
            optSb.add("--hive-overwrite");   // always overwrite; the hive table must already exist
            optSb.add("--hive-import");
            optSb.add("--hive-drop-import-delims");
            if (hive_table == null || "".equals(hive_table)) {
                log.info("Error: hive_table must be set when --hive-import is used !");
            } else {
                optSb.add("--hive-table");
                optSb.add(hive_table.trim());
            }
            if (hive_partition_key != null && !"".equals(hive_partition_key)) {
                optSb.add("--hive-partition-key");
                optSb.add(hive_partition_key.trim());
                optSb.add("--hive-partition-value");
                optSb.add("\"" + count_date.trim() + "\"");
            }
        }
        optSb.add("--null-string");
        optSb.add("'\\\\N'");
        optSb.add("--null-non-string");
        optSb.add("'\\\\N'");
        return optSb.toArray(new String[0]);
    }
}
In many cases a table needs to be synchronized incrementally every day, so the where condition must carry a date. This is what the $TXDATE placeholder is for: createOptions() replaces it with count_date (yesterday by default, or the task's alter_date if set) before appending the --where option.
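For example, an incremental task could be configured like this (hypothetical values; the column name CREATE_TIME is only illustrative):

-- Pull only rows from the run date onward; $TXDATE is substituted with count_date at runtime.
UPDATE task_info
SET where_express = 'CREATE_TIME >= $TXDATE'
WHERE task_id = 1;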