环境:CentOS 6.3, Hadoop 1.1.2, JDK 1.6, Eclipse, Cascading
应用场景:当需要把输入文件根据某个字段分片输出(数据按字段值输出到相应的目录)时。
直接上代码:
package core.ebay.subscription;
import cascading.flow.FlowConnector; import cascading.flow.FlowDef; import cascading.flow.hadoop.HadoopFlowConnector; import cascading.pipe.Pipe; import cascading.scheme.hadoop.TextDelimited; import cascading.tap.SinkMode; import cascading.tap.Tap; import cascading.tap.hadoop.Hfs; import cascading.tap.hadoop.TemplateTap; import cascading.tuple.Fields;
public class SequenceFileTest {
public static void main(String[] args) {
TextDelimited inScheme = new TextDelimited(new Fields("year", "month","entry"), ";"); TextDelimited outScheme = new TextDelimited(new Fields("month","entry")); Hfs intap = new Hfs(inScheme, args[0]); //define input Hfs intap1 = new Hfs(outScheme, args[1]); //define output String template = "%s"; //output file path type //String template = "%s-%s"; //base on year & month genereate output directory Tap month = new TemplateTap(intap1, template,new Fields("month"), SinkMode.REPLACE); //Base on the Filed generate to according directory Pipe inputPipe = new Pipe("inputPipe"); FlowDef flowDef = FlowDef.flowDef().addSource(inputPipe, intap) .addTailSink(inputPipe, month); FlowConnector flowConnector = new HadoopFlowConnector(); flowConnector.connect(flowDef).complete(); } }
inputFile:
2013;10;132131
2013;10;13213fsfsdfd
2013;10;13213fsfsdfdfsadf
2014;09;sfdsfd
outputFilePath:
10:
10 132131
10 13213fsfsdfd
10 13213fsfsdfdfsadf
09:
09 sfdsfd