.NET实现 数据提取、转换和加载

早晨看到一篇文章:

http://www.codeproject.com/Articles/34556/Write-ETL-jobs-in-pure-C

 

有两个文件:User name file(Id、Name)和 Addresses file(Id、Address):

Id Name
1  Bob

Id Address

1 123 Main St.

2 42 Everywhich way
如果想得到 Id、Name、Address 合并后的结果,在 SQL 中用一个关联查询就可以得到。那么在文件处理的场景下,基于 C# 该如何处理呢?文章的代码中给出了解决方案。

首先读取两个文件的内容(其中 FileEngine 是基于 FileHelpers 实现的):
/// <summary>
/// Source operation that reads <c>UserNameRecord</c> entries from a file
/// and emits each one as a <see cref="Row"/>.
/// </summary>
public class UserNameRead : AbstractOperation
{
    // Path of the input file; assigned once by the constructor.
    private readonly string _sourcePath;

    /// <summary>
    /// Creates the operation for the given input file.
    /// </summary>
    /// <param name="filePath">Path passed to <c>FluentFile.For&lt;UserNameRecord&gt;().From(...)</c> when the operation runs.</param>
    public UserNameRead(string filePath)
    {
        _sourcePath = filePath;
    }

    /// <summary>
    /// Yields one <see cref="Row"/> per record enumerated from the input file.
    /// </summary>
    /// <param name="rows">Incoming rows; not used by this operation.</param>
    /// <returns>A lazily evaluated sequence of rows built with <c>Row.FromObject</c>.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        // This is an iterator: the engine is created when enumeration starts and
        // disposed when enumeration completes or the enumerator is disposed.
        using (var reader = FluentFile.For<UserNameRecord>().From(_sourcePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}



/// <summary>
/// Source operation that reads <c>UserAddressRecord</c> entries from a file
/// and emits each one as a <see cref="Row"/>.
/// </summary>
public class UserAddressRead : AbstractOperation
{
    // Path of the input file; assigned once by the constructor.
    private readonly string _sourcePath;

    /// <summary>
    /// Creates the operation for the given input file.
    /// </summary>
    /// <param name="filePath">Path passed to <c>FluentFile.For&lt;UserAddressRecord&gt;().From(...)</c> when the operation runs.</param>
    public UserAddressRead(string filePath)
    {
        _sourcePath = filePath;
    }

    /// <summary>
    /// Yields one <see cref="Row"/> per record enumerated from the input file.
    /// </summary>
    /// <param name="rows">Incoming rows; not used by this operation.</param>
    /// <returns>A lazily evaluated sequence of rows built with <c>Row.FromObject</c>.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        // This is an iterator: the engine is created when enumeration starts and
        // disposed when enumeration completes or the enumerator is disposed.
        using (var reader = FluentFile.For<UserAddressRecord>().From(_sourcePath))
        {
            foreach (object record in reader)
            {
                yield return Row.FromObject(record);
            }
        }
    }
}

建立两个文件之间的关联(按 Id 做内连接),并合并出新的记录:
/// <summary>
/// Join operation that matches left and right rows on their "Id" column and
/// produces a row containing the left row's data plus the right row's "Address".
/// </summary>
public class JoinUserRecords : JoinOperation
{
    /// <summary>
    /// Declares an inner join keyed on "Id" on both the left and right inputs.
    /// </summary>
    protected override void SetupJoinConditions()
    {
        InnerJoin.Left("Id").Right("Id");
    }

    /// <summary>
    /// Builds the output row for a matched pair of rows.
    /// </summary>
    /// <param name="leftRow">Row from the left input; its contents are copied into the result.</param>
    /// <param name="rightRow">Row from the right input; only its "Address" value is used.</param>
    /// <returns>A new <see cref="Row"/> holding the copied left data and the right "Address".</returns>
    protected override Row MergeRows(Row leftRow, Row rightRow)
    {
        var merged = new Row();
        merged.Copy(leftRow);

        // "Address" is the only value taken from the right-hand row.
        merged["Address"] = rightRow["Address"];

        return merged;
    }
}


合并后的记录如何输出到文件:
/// <summary>
/// Sink operation that writes every incoming row to a file as a
/// <c>UserFullRecord</c> and passes the row through unchanged.
/// </summary>
public class UserFullWrite : AbstractOperation
{
    // Path of the output file; assigned once by the constructor.
    private readonly string _targetPath;

    /// <summary>
    /// Creates the operation for the given output file.
    /// </summary>
    /// <param name="filePath">Path passed to <c>FluentFile.To(...)</c> when the operation runs.</param>
    public UserFullWrite(string filePath)
    {
        _targetPath = filePath;
    }

    /// <summary>
    /// Writes each row to the output file and then yields it.
    /// </summary>
    /// <param name="rows">Rows to write; each is converted with <c>ToObject&lt;UserFullRecord&gt;()</c>.</param>
    /// <returns>The same rows, in the same order, yielded one at a time after being written.</returns>
    public override IEnumerable<Row> Execute(IEnumerable<Row> rows)
    {
        // This is an iterator: nothing below runs until the returned sequence is
        // enumerated, and each row is written just before it is yielded.
        FluentFile definition = FluentFile.For<UserFullRecord>();
        definition.HeaderText = "Id\tName\tAddress";

        using (var writer = definition.To(_targetPath))
        {
            foreach (Row current in rows)
            {
                writer.Write(current.ToObject<UserFullRecord>());

                // Pass the row along so a later operation can still consume it.
                yield return current;
            }
        }
    }
}

调用方法:
/// <summary>
/// ETL process that joins the user-name and user-address files and writes
/// the joined records to the output file.
/// </summary>
public class MainProcess : EtlProcess
{
    /// <summary>
    /// Registers the pipeline: first the join over the two file readers,
    /// then the writer. File paths come from <c>Settings.Default</c>.
    /// </summary>
    protected override void Initialize()
    {
        // Step 1: join names (left) with addresses (right).
        var joinedUsers = new JoinUserRecords()
            .Left(new UserNameRead(Settings.Default.NamesFile))
            .Right(new UserAddressRead(Settings.Default.AddressesFile));
        Register(joinedUsers);

        // Step 2: write the joined rows to the output file.
        var writeUsers = new UserFullWrite(Settings.Default.OutputFile);
        Register(writeUsers);
    }
}

总结:结构化的文件通常比较容易处理,而非结构化的文件则不易处理。


                            

你可能感兴趣的:(.net)