kettle -ETL技术

为了给下个项目做个准备工作,我的下个项目主要就是数据抽取这一块,整个系统中的数据都是从其他系统中通过日钟传输过来的,所以为了解决数据的导入问题,我们要在这个项目用ETL技术,ETL(extract,transform,load)数据的抽取,转换,和导入。数据的抽取有很多种来源,比如说:直接从数据库中拿,从文本文件中获取,或者从execl文件中获取等,第二个重要的步骤就是转换,我们获取的数据必须转换成我们合法的数据,怎么说呢,就像你有很多水果,但是顾客想要的是好的水果,坏的我不要,在ETL中我们就要对数据进行清洗了,清洗好的数据,我们再进行转化就成为我们想要的数据咯,说到这里,大家估计明白了一点了,ETL其实是一个很抽象的概念,我们所要做的,就是怎么去理解他的思想,理解了他,你就对他了解了一半,还一半我们该怎么半呢?毫无疑问当然是实践咯!
ETL技术有很多的框架来支撑这项技术,比如说非常流行的kettle,还有oracle的,具体的叫什么我也忘记了。
现在就让我来给大家讲讲kettle是怎么工作的吧!
下载一个kettle有很多中方式,你可以使用svn下载,也可以从kettle的官方网站下载。如果你还不知道怎么下载,那就来我的网站下载吧,我等会会给大家上传一个kettle 的,我的这个是kettle 3.0.2的。
package test;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;

import org.pentaho.di.core.Const;
import org.pentaho.di.core.KettleEnvironment;
import org.pentaho.di.core.NotePadMeta;
import org.pentaho.di.core.database.Database;
import org.pentaho.di.core.database.DatabaseMeta;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.LogWriter;
import org.pentaho.di.core.util.EnvUtil;
import org.pentaho.di.trans.StepLoader;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransHopMeta;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.selectvalues.SelectValuesMeta;
import org.pentaho.di.trans.steps.tableinput.TableInputMeta;
import org.pentaho.di.trans.steps.tableoutput.TableOutputMeta;


/**
* Class created to demonstrate the creation of transformations on-the-fly.
*
* @author Matt
*
*/
public class TransBuilder
{
public static final String[] databasesXML = {
"" +
"" +
"target" +
"localhost" +
"MYSQL" +
"Native" +
"test" +
"3306" +
"root" +
"12345" +
"
",

"" +
"" +
"source" +
"localhost" +
"MYSQL" +
"Native" +
"test" +
"3306" +
"root" +
"12345" +
"
"
};

/**
* Creates a new Transformation using input parameters such as the tablename to read from.
* @param transformationName The name of the transformation
* @param sourceDatabaseName The name of the database to read from
* @param sourceTableName The name of the table to read from
* @param sourceFields The field names we want to read from the source table
* @param targetDatabaseName The name of the target database
* @param targetTableName The name of the target table we want to write to
* @param targetFields The names of the fields in the target table (same number of fields as sourceFields)
* @return A new transformation
* @throws KettleException In the rare case something goes wrong
*/
public static final TransMeta buildCopyTable(String transformationName, String sourceDatabaseName, String sourceTableName, String[] sourceFields, String targetDatabaseName, String targetTableName, String[] targetFields) throws KettleException
{
EnvUtil.environmentInit();
try
{
//
// Create a new transformation...
//
TransMeta transMeta = new TransMeta();
transMeta.setName(transformationName);

// Add the database connections
for (int i=0;i {
DatabaseMeta databaseMeta = new DatabaseMeta(databasesXML[i]);
transMeta.addDatabase(databaseMeta);
}

DatabaseMeta sourceDBInfo = transMeta.findDatabase(sourceDatabaseName);
DatabaseMeta targetDBInfo = transMeta.findDatabase(targetDatabaseName);


//
// Add a note
//
String note = "Reads information from table [" + sourceTableName+ "] on database [" + sourceDBInfo + "]" + Const.CR;
note += "After that, it writes the information to table [" + targetTableName + "] on database [" + targetDBInfo + "]";
NotePadMeta ni = new NotePadMeta(note, 150, 10, -1, -1);
transMeta.addNote(ni);

//
// create the source step...
//
String fromstepname = "read from [" + sourceTableName + "]";
TableInputMeta tii = new TableInputMeta();
tii.setDatabaseMeta(sourceDBInfo);
String selectSQL = "SELECT "+Const.CR;
for (int i=0;i {
if (i>0) selectSQL+=", "; else selectSQL+=" ";
selectSQL+=sourceFields[i]+Const.CR;
}
selectSQL+="FROM "+sourceTableName;
tii.setSQL(selectSQL);

StepLoader steploader = StepLoader.getInstance();

String fromstepid = steploader.getStepPluginID(tii);
StepMeta fromstep = new StepMeta(fromstepid, fromstepname, (StepMetaInterface) tii);
fromstep.setLocation(150, 100);
fromstep.setDraw(true);
fromstep.setDescription("Reads information from table [" + sourceTableName + "] on database [" + sourceDBInfo + "]");
transMeta.addStep(fromstep);

//
// add logic to rename fields
// Use metadata logic in SelectValues, use SelectValueInfo...
//
SelectValuesMeta svi = new SelectValuesMeta();
svi.allocate(0, 0, sourceFields.length);
for (int i = 0; i < sourceFields.length; i++)
{
svi.getMeta()[i].setName(sourceFields[i]);
svi.getMeta()[i].setRename(targetFields[i]);
}

String selstepname = "Rename field names";
String selstepid = steploader.getStepPluginID(svi);
StepMeta selstep = new StepMeta(selstepid, selstepname, (StepMetaInterface) svi);
selstep.setLocation(350, 100);
selstep.setDraw(true);
selstep.setDescription("Rename field names");
transMeta.addStep(selstep);

TransHopMeta shi = new TransHopMeta(fromstep, selstep);
transMeta.addTransHop(shi);
fromstep = selstep;

//
// Create the target step...
//
//
// Add the TableOutputMeta step...
//
String tostepname = "write to [" + targetTableName + "]";
TableOutputMeta toi = new TableOutputMeta();
toi.setDatabaseMeta(targetDBInfo);
toi.setTablename(targetTableName);
toi.setCommitSize(200);
toi.setTruncateTable(true);

String tostepid = steploader.getStepPluginID(toi);
StepMeta tostep = new StepMeta(tostepid, tostepname, (StepMetaInterface) toi);
tostep.setLocation(550, 100);
tostep.setDraw(true);
tostep.setDescription("Write information to table [" + targetTableName + "] on database [" + targetDBInfo + "]");
transMeta.addStep(tostep);

//
// Add a hop between the two steps...
//
TransHopMeta hi = new TransHopMeta(fromstep, tostep);
transMeta.addTransHop(hi);

// OK, if we're still here: overwrite the current transformation...
return transMeta;
}
catch (Exception e)
{
throw new KettleException("An unexpected error occurred creating the new transformation", e);
}
}

/**
* 1) create a new transformation
* 2) save the transformation as XML file
* 3) generate the SQL for the target table
* 4) Execute the transformation
* 5) drop the target table to make this program repeatable
*
* @param args
*/
public static void main(String[] args) throws Exception
{
KettleEnvironment.init();

// Init the logging...
//
LogWriter.getInstance("TransBuilder.log", true, LogWriter.LOG_LEVEL_DETAILED);

// The parameters we want, optionally this can be
String fileName = "NewTrans.xml";
String transformationName = "Test Transformation";
String sourceDatabaseName = "source";
String sourceTableName = "customer";
String sourceFields[] = {
"id",
"customernr",
"name",
"firstname"
};

String targetDatabaseName = "target";
String targetTableName = "cust";
String targetFields[] = {
"id",
"custNo",
"lastName",
"firstName"
};


// Generate the transformation.
TransMeta transMeta = TransBuilder.buildCopyTable(
transformationName,
sourceDatabaseName,
sourceTableName,
sourceFields,
targetDatabaseName,
targetTableName,
targetFields
);

// Save it as a file:
String xml = transMeta.getXML();
DataOutputStream dos = new DataOutputStream(new FileOutputStream(new File(fileName)));
dos.write(xml.getBytes("UTF-8"));
dos.close();
System.out.println("Saved transformation to file: "+fileName);

// OK, What's the SQL we need to execute to generate the target table?
String sql = transMeta.getSQLStatementsString();

// Execute the SQL on the target table:
Database targetDatabase = new Database(transMeta.findDatabase(targetDatabaseName));
targetDatabase.connect();
targetDatabase.execStatements(sql);

// Now execute the transformation...
Trans trans = new Trans(transMeta);
trans.execute(null);
trans.waitUntilFinished();

// For testing/repeatability, we drop the target table again
targetDatabase.execStatement("drop table "+targetTableName);
targetDatabase.disconnect();
}


}

你可能感兴趣的:(guobin_ETL)