开源ETL工具Kettle的transformation中,tableInput步骤无法将上一步骤传入的参数添加到tableInput的输出流中。但是有时后续步骤需要用到上一步骤中的数据,这种情况下就很难处理了。
Kettle 版本 5.0
数据库 Oracle 11g R2
Eclipse
如上图所示,采用记录关联步骤,可以将前一步骤和tableInput步骤的数据流整合在一起。但是这样处理有一个局限性:只有当tableInput步骤的前一步骤查询的结果有且只有一行时,数据流才是正确的,因为这种方式采用笛卡尔积进行数据关联。
对tableInput步骤进行改进,使其能够将前一步骤的参数输入流整合到tableInput步骤的输出流中。为了改进tableInput步骤,构建Kettle代码开发调试环境,如下图所示:
在engine/src代码目录中可以找到tableInput步骤相关代码,当然为了避免直接修改Kettle源代码,一般在plugins下作为插件开发,源代码不做过多描述。如下图所示:
将kettle源代码,复制到plugins下,并重新命名package以及代码中的依赖关系
1、TableInputData增加两个新的变量
/**
 * Runtime state holder for the extended table input step.
 *
 * <p>In addition to the fields of the stock step, it carries
 * {@link #newrowMeta} and {@link #newthisrow}, which hold the output row
 * layout and values after the incoming parameter fields have been appended.
 */
public class TableInputData extends BaseStepData implements StepDataInterface {

    // --- database access ---
    public Database db;
    public ResultSet rs;

    // --- rows read from the result set ---
    public Object[] thisrow;
    public Object[] nextrow;
    public RowMetaInterface rowMeta;

    // --- extended output ---
    /** Field metadata after extension. */
    public RowMetaInterface newrowMeta;
    /** Field values after extension. */
    public Object[] newthisrow;

    // --- input side ---
    public String lookupStep;
    public RowSet rowSet;
    public StreamInterface infoStream;

    public boolean isCanceled;

    /** Creates the state holder with no connection, no result set and no buffered rows. */
    public TableInputData() {
        super();
        this.db = null;
        this.rs = null;
        this.thisrow = null;
        this.nextrow = null;
        this.newthisrow = null;
        this.lookupStep = null;
    }
}
TableInput.java
/**
 * Produces at most one output row per call.
 *
 * <p>On the first call the query parameters are read from the info stream
 * (if one is configured) and the query is opened. On later calls the next
 * row is fetched from the open result set. Each emitted row consists of the
 * result-set fields followed by the parameter fields, described by
 * {@code data.newrowMeta}.
 *
 * <p>When "execute for each input row" is active and the current result set
 * is exhausted, the next parameter row is read and the query is re-run.
 *
 * @param smi step metadata (unused here; the {@code meta} field is used)
 * @param sdi step data (unused here; the {@code data} field is used)
 * @return {@code true} if the method should be called again, {@code false}
 *         when all output is done or the query could not be opened
 * @throws KettleException if the input rowset cannot be found or a row
 *         cannot be read or written
 */
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
    if (first) { // we just got started
        Object[] parameters;
        RowMetaInterface parametersMeta;
        first = false;

        // Make sure we read data from source steps...
        if (data.infoStream.getStepMeta() != null) {
            if (meta.isExecuteEachInputRow()) {
                if (log.isDetailed()) {
                    logDetailed("Reading single row from stream [" + data.infoStream.getStepname() + "]");
                }
                data.rowSet = findInputRowSet(data.infoStream.getStepname());
                if (data.rowSet == null) {
                    throw new KettleException("Unable to find rowset to read from, perhaps step [" + data.infoStream.getStepname() + "] doesn't exist. (or perhaps you are trying a preview?)");
                }
                parameters = getRowFrom(data.rowSet);
                parametersMeta = data.rowSet.getRowMeta();
            } else {
                if (log.isDetailed()) {
                    logDetailed("Reading query parameters from stream [" + data.infoStream.getStepname() + "]");
                }
                RowMetaAndData rmad = readStartDate(); // Read values in lookup table (look)
                parameters = rmad.getData();
                parametersMeta = rmad.getRowMeta();
            }
            if (parameters != null) {
                if (log.isDetailed()) {
                    logDetailed("Query parameters found = " + parametersMeta.getString(parameters));
                }
            }
        } else {
            parameters = new Object[] {};
            parametersMeta = new RowMeta();
        }

        if (meta.isExecuteEachInputRow() && (parameters == null || parametersMeta.size() == 0)) {
            setOutputDone(); // signal end to receiver(s)
            return false; // stop immediately, nothing to do here.
        }

        boolean success = doQuery(parametersMeta, parameters);
        if (!success) {
            return false;
        }
    } else {
        if (data.thisrow != null) { // We can expect more rows
            data.nextrow = data.db.getRow(data.rs, meta.isLazyConversionActive());
            if (data.nextrow != null) {
                incrementLinesInput();
            }
        }
    }

    if (data.thisrow == null) { // Finished reading?
        boolean done = false;
        if (meta.isExecuteEachInputRow()) { // Try to get another row from the input stream
            Object[] nextRow = getRowFrom(data.rowSet);
            if (nextRow == null) { // Nothing more to get!
                done = true;
            } else {
                // First close the previous query, otherwise we run out of cursors!
                closePreviousQuery();

                boolean success = doQuery(data.rowSet.getRowMeta(), nextRow); // OK, perform a new query
                if (!success) {
                    return false;
                }

                if (data.thisrow != null) {
                    emitCurrentRow();
                }
            }
        } else {
            done = true;
        }

        if (done) {
            setOutputDone(); // signal end to receiver(s)
            return false; // end of data or error.
        }
    } else {
        emitCurrentRow();
    }

    return true;
}

/**
 * Writes the current extended row to the output, advances the cursor to the
 * look-ahead row and prepares the extended row for the next call.
 */
private void emitCurrentRow() throws KettleException {
    // Output the extended field metadata and values.
    putRow(data.newrowMeta, data.newthisrow); // fill the rowset(s). (wait for empty)
    data.thisrow = data.nextrow;

    // The extended row must follow the cursor; otherwise every row of the
    // result set would be emitted as a copy of the first one.
    if (data.thisrow != null) {
        data.newthisrow = extendWithParameters(data.thisrow, data.newthisrow);
    }

    if (checkFeedback(getLinesInput())) {
        if (log.isBasic()) {
            logBasic("linenr " + getLinesInput());
        }
    }
}

/**
 * Builds a fresh output row from a result-set row plus the parameter values
 * stored at the tail of the previously built output row.
 *
 * <p>A new array is allocated on every call because the previous one has
 * already been handed to {@code putRow}.
 */
private Object[] extendWithParameters(Object[] resultRow, Object[] previousOutputRow) {
    int resultFields = data.rowMeta.size();
    int parameterFields = data.newrowMeta.size() - resultFields;

    Object[] outputRow = RowDataUtil.allocateRowData(resultFields + parameterFields);
    System.arraycopy(resultRow, 0, outputRow, 0, resultFields);
    if (parameterFields > 0) {
        System.arraycopy(previousOutputRow, resultFields, outputRow, resultFields, parameterFields);
    }
    return outputRow;
}
private boolean doQuery(RowMetaInterface parametersMeta,Object[] parameters) throws KettleDatabaseException{
boolean success = true;
// Open the query with the optional parameters received from the source steps.
String sql = null;
if (meta.isVariableReplacementActive ()) sql = environmentSubstitute (meta.getSQL ());
else sql = meta.getSQL ();
if (log.isDetailed ()) logDetailed ("SQL query : " + sql);
if (parametersMeta.isEmpty ()) {
data.rs = data.db.openQuery (sql, null, null, ResultSet.FETCH_FORWARD, meta.isLazyConversionActive ());
} else {
data.rs = data.db.openQuery (sql, parametersMeta, parameters, ResultSet.FETCH_FORWARD, meta.isLazyConversionActive ());
}
if (data.rs == null) { logError ("Couldn't open Query [" + sql + "]"); setErrors (1); stopAll (); success = false; } else { // Keep the metadata data.rowMeta = data.db.getReturnRowMeta (); // Set the origin on the row metadata... if (data.rowMeta != null) { for ( ValueMetaInterface valueMeta : data.rowMeta.getValueMetaList () ) { valueMeta.setOrigin (getStepname ()); }
}
// Get the first row...
data.thisrow = data.db.getRow (data.rs);
if (data.thisrow != null) { incrementLinesInput (); data.nextrow = data.db.getRow (data.rs); if (data.nextrow != null) incrementLinesInput (); }
// 若参数不为空,将参数字段元信息和数据扩展的新的变量中
if (!parametersMeta.isEmpty ()) {
// 重新分配数据存储数组大小,并将值添加到tableinput步骤数据流末尾
data.newthisrow = RowDataUtil.allocateRowData (data.rowMeta.size () + parameters.length + 2);
System.arraycopy (data.thisrow, 0, data.newthisrow, 0, data.rowMeta.size ());
//将参数元信息,加入到tableinput步骤输出字段信息末尾
data.newrowMeta = data.rowMeta.clone ();
int len = data.rowMeta.size ();
for ( int i = 0 ; i < parametersMeta.size () ; i++ )
data.newthisrow[len + i] = parameters[i];
data.newrowMeta.addRowMeta (parametersMeta);
}else{//若是不存在参数,不行扩容处理
data.newrowMeta = data.rowMeta.clone ();
data.newthisrow = data.thisrow.clone ();
}
}
return success;
}
改进代码完毕后,进行代码测试。在kettle-steps.xml配置插件信息后启动Spoon.java
运行结果如下图所示:
作者 @zokaper
2016 年 01 月 17日
声明:因时间和精力的原因,文章一般会先写出提纲内容,之后再慢慢补充;若有你感兴趣的地方,请留言…