Pig's built-in PigStorage cannot specify a record (line) delimiter, so I wrote a simple UDF class that lets you set both the field delimiter and the record delimiter. I had written a simpler version of this before:
http://blog.csdn.net/ruishenh/article/details/12048067
but it had serious drawbacks, so this is a rewrite.
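The core trick, shown in isolation before the full UDF below: Hadoop's LineRecordReader accepts an arbitrary record delimiter as a byte array, so a custom InputFormat only has to hand it the user-supplied separator. A minimal sketch (the class name SlashDelimitedInputFormat is just for illustration, not part of the UDF):

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Sketch only: an InputFormat that ends a record at every '/' instead of '\n'.
public class SlashDelimitedInputFormat extends TextInputFormat {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        return new LineRecordReader("/".getBytes(StandardCharsets.UTF_8));
    }
}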
Steps: build the jar, upload it to the server, and register it in grunt:
grunt> register /home/pig/pig-0.11.0/udflib/myStorage.jar
grunt> cat student;
1,xiaohouzi,25/2,xiaohouzi2,24/3,xiaohouzi3,23
grunt> a = load 'student' using com.hcr.hadoop.pig.MyStorage(',','/');
grunt> dump a;
(1,xiaohouzi,25)
(2,xiaohouzi2,24)
(3,xiaohouzi3,23)
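For reference, this is what getNext() in the source below does to each record once the '/' record delimiter has been applied: split on the field delimiter and wrap the pieces in a Tuple. A standalone sketch (GetNextSketch is a made-up demo class, not part of the UDF):

import java.util.Arrays;

import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class GetNextSketch {
    public static void main(String[] args) {
        // one record, after the input has already been split on '/'
        String record = "1,xiaohouzi,25";
        Tuple t = TupleFactory.getInstance()
                .newTuple(Arrays.asList(record.split(",")));
        System.out.println(t);   // prints (1,xiaohouzi,25)
    }
}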
grunt> store a into 'myStorageOut' using com.hcr.hadoop.pig.MyStorage(',','/');
After the store reports success, check the output:
grunt> cat myStorageOut
1,xiaohouzi,25/2,xiaohouzi2,24/3,xiaohouzi3,23/
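Note the trailing '/' after the last record: the record writer in the source below appends the record delimiter after every tuple, including the last one. A hypothetical standalone demo of that write loop (TrailingDelimiterDemo is not part of the UDF):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class TrailingDelimiterDemo {
    public static void main(String[] args) throws IOException {
        String[][] tuples = { { "1", "xiaohouzi", "25" }, { "2", "xiaohouzi2", "24" } };
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(buf);
        for (String[] t : tuples) {
            for (int i = 0; i < t.length; i++) {
                out.writeBytes(t[i]);
                if (i != t.length - 1) {
                    out.writeByte(',');      // field delimiter only between fields
                }
            }
            out.writeBytes("/");             // record delimiter after every record
        }
        System.out.println(buf.toString("UTF-8"));   // 1,xiaohouzi,25/2,xiaohouzi2,24/
    }
}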
Source code:
package com.hcr.hadoop.pig;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.pig.Expression;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigException;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.StoreFunc;
import org.apache.pig.StoreFuncInterface;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.apache.pig.impl.util.StorageUtil;

/**
 * A load/store UDF that takes both a field delimiter and a record delimiter.
 */
public class MyStorage extends LoadFunc implements StoreFuncInterface, LoadMetadata {

    private static final Log LOG = LogFactory.getLog(MyStorage.class);

    private static final String utf8 = "UTF-8";

    private static String fieldDel = "\t";
    private static String recordDel = "\n";

    protected RecordReader recordReader = null;
    protected RecordWriter writer = null;

    public MyStorage() {
    }

    public MyStorage(String fieldDel) {
        this(fieldDel, "\n");
    }

    public MyStorage(String fieldDel, String recordDel) {
        this.fieldDel = fieldDel;
        this.recordDel = recordDel;
    }

    @Override
    public void setLocation(String s, Job job) throws IOException {
        FileInputFormat.setInputPaths(job, s);
    }

    @Override
    public InputFormat getInputFormat() throws IOException {
        return new MyStorageInputFormat(recordDel);
    }

    @Override
    public void prepareToRead(RecordReader recordReader, PigSplit pigSplit)
            throws IOException {
        this.recordReader = recordReader;
    }

    // Split each record on the field delimiter and wrap the fields in a tuple.
    @Override
    public Tuple getNext() throws IOException {
        try {
            boolean flag = recordReader.nextKeyValue();
            if (!flag) {
                return null;
            }
            Text value = (Text) recordReader.getCurrentValue();
            String[] strArray = value.toString().split(fieldDel);
            List lst = new ArrayList<String>();
            int i = 0;
            for (String singleItem : strArray) {
                lst.add(i++, singleItem);
            }
            return TupleFactory.getInstance().newTuple(lst);
        } catch (InterruptedException e) {
            throw new ExecException("Read data error",
                    PigException.REMOTE_ENVIRONMENT, e);
        }
    }

    @Override
    public String relToAbsPathForStoreLocation(String location, Path curDir)
            throws IOException {
        return LoadFunc.getAbsolutePath(location, curDir);
    }

    @Override
    public OutputFormat getOutputFormat() throws IOException {
        return new MyStorageOutputFormat(StorageUtil.parseFieldDel(fieldDel),
                this.recordDel);
    }

    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        job.getConfiguration().set("mapred.textoutputformat.separator", "");
        FileOutputFormat.setOutputPath(job, new Path(location));
        if ("true".equals(job.getConfiguration().get(
                "output.compression.enabled"))) {
            FileOutputFormat.setCompressOutput(job, true);
            String codec = job.getConfiguration().get(
                    "output.compression.codec");
            try {
                FileOutputFormat.setOutputCompressorClass(job,
                        (Class<? extends CompressionCodec>) Class
                                .forName(codec));
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Class not found: " + codec);
            }
        } else {
            // This makes it so that storing to a directory ending with ".gz"
            // or ".bz2" works.
            setCompression(new Path(location), job);
        }
    }

    private void setCompression(Path path, Job job) {
        String location = path.getName();
        if (location.endsWith(".bz2") || location.endsWith(".bz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class);
        } else if (location.endsWith(".gz")) {
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        } else {
            FileOutputFormat.setCompressOutput(job, false);
        }
    }

    @Override
    public void checkSchema(ResourceSchema s) throws IOException {
        // no-op
    }

    @Override
    public void prepareToWrite(RecordWriter writer) throws IOException {
        this.writer = writer;
    }

    @Override
    public void putNext(Tuple t) throws IOException {
        try {
            writer.write(null, t);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    @Override
    public void setStoreFuncUDFContextSignature(String signature) {
        // no-op
    }

    @Override
    public void cleanupOnFailure(String location, Job job) throws IOException {
        StoreFunc.cleanupOnFailureImpl(location, job);
    }

    @Override
    public void cleanupOnSuccess(String location, Job job) throws IOException {
        // no-op
    }

    // LoadMetadata: supply a fixed schema so the load statement does not need
    // an AS clause (see the example at the end of this post).
    @Override
    public ResourceSchema getSchema(String location, Job job)
            throws IOException {
        ResourceSchema rs = new ResourceSchema();
        FieldSchema c1 = new FieldSchema("c1", DataType.INTEGER);
        FieldSchema c2 = new FieldSchema("c2", DataType.INTEGER);
        FieldSchema c3 = new FieldSchema("c3", DataType.DOUBLE);
        ResourceFieldSchema fs1 = new ResourceFieldSchema(c1);
        ResourceFieldSchema fs2 = new ResourceFieldSchema(c2);
        ResourceFieldSchema fs3 = new ResourceFieldSchema(c3);
        rs.setFields(new ResourceFieldSchema[] { fs1, fs2, fs3 });
        return rs;
    }

    @Override
    public ResourceStatistics getStatistics(String location, Job job)
            throws IOException {
        return null;
    }

    @Override
    public String[] getPartitionKeys(String location, Job job)
            throws IOException {
        return null;
    }

    @Override
    public void setPartitionFilter(Expression partitionFilter)
            throws IOException {
        // no-op
    }
}

class MyStorageInputFormat extends TextInputFormat {

    private final String recordDel;

    public MyStorageInputFormat(String recordDel) {
        this.recordDel = recordDel;
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        String delimiter = context.getConfiguration().get(
                "textinputformat.record.delimiter");
        if (recordDel != null) {
            delimiter = recordDel;
        }
        byte[] recordDelimiterBytes = null;
        if (null != delimiter) {
            try {
                recordDelimiterBytes = decode(delimiter).getBytes("UTF-8");
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return new LineRecordReader(recordDelimiterBytes);
    }

    /**
     * The delimiter passed in from a workflow may be a special character
     * written in octal or hexadecimal notation (e.g. "\\001"); decode it
     * into the real bytes.
     * @throws IOException
     */
    public static String decode(String str) throws IOException {
        String re = str;
        if (str != null && str.startsWith("\\")) {
            str = str.substring(1, str.length());
            String[] chars = str.split("\\\\");
            byte[] bytes = new byte[chars.length];
            for (int i = 0; i < chars.length; i++) {
                if (chars[i].equals("t")) {
                    bytes[i] = 9;
                } else if (chars[i].equals("r")) {
                    bytes[i] = 13;
                } else if (chars[i].equals("n")) {
                    bytes[i] = 10;
                } else if (chars[i].equals("b")) {
                    bytes[i] = 8;
                } else {
                    bytes[i] = Byte.decode(chars[i]);
                }
            }
            try {
                re = new String(bytes, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new IOException(str, e);
            }
        }
        return re;
    }
}

class MyStorageOutputFormat extends
        TextOutputFormat<WritableComparable, Tuple> {

    private final byte fieldDel;
    private final String recordDel;

    public MyStorageOutputFormat(byte delimiter) {
        this(delimiter, "\n");
    }

    public MyStorageOutputFormat(byte delimiter, String recordDel) {
        this.fieldDel = delimiter;
        this.recordDel = recordDel;
    }

    protected static class MyRecordWriter extends
            TextOutputFormat.LineRecordWriter<WritableComparable, Tuple> {

        private static byte[] newline;
        private final byte fieldDel;

        public MyRecordWriter(DataOutputStream out, byte fieldDel)
                throws UnsupportedEncodingException {
            this(out, fieldDel, "\n".getBytes("UTF-8"));
        }

        public MyRecordWriter(DataOutputStream out, byte fieldDel, byte[] record) {
            super(out);
            this.fieldDel = fieldDel;
            this.newline = record;
        }

        // Write fields separated by fieldDel, then the record delimiter after
        // every tuple (which is why the output ends with a trailing delimiter).
        public synchronized void write(WritableComparable key, Tuple value)
                throws IOException {
            int sz = value.size();
            for (int i = 0; i < sz; i++) {
                StorageUtil.putField(out, value.get(i));
                if (i != sz - 1) {
                    out.writeByte(fieldDel);
                }
            }
            out.write(newline);
        }
    }

    @Override
    public RecordWriter<WritableComparable, Tuple> getRecordWriter(
            TaskAttemptContext job) throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
            Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(
                    job, GzipCodec.class);
            codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass,
                    conf);
            extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        if (!isCompressed) {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new MyRecordWriter(fileOut, fieldDel,
                    this.recordDel.getBytes());
        } else {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new MyRecordWriter(new DataOutputStream(
                    codec.createOutputStream(fileOut)), fieldDel,
                    this.recordDel.getBytes());
        }
    }
}
I also tested \001 and \002 as delimiters, and they work as well (the \001/\002 bytes are non-printing, so cat shows the records run together):
grunt> register /home/pig/pig-0.11.0/udflib/myStorage.jar
grunt> cat X;
keyDataKNZKCZY:ZDKJS:616150:AFS:3842708d_20131219194420-642464756keyDataKNZKCZY:ZDKJS:616614:AFS:3843920d_20131219194420-642464756keyDataKNZKCZY:ZDKJS:616661:AFS:3844040d_20131219194420-642464756
grunt> a = load 'X' using com.hcr.hadoop.pig.MyStorage('\\001','\\002');
grunt> dump a;
(keyData,KNZKCZY:ZDKJS:616150:AFS:3842708,d_20131219194420-642464756)
(keyData,KNZKCZY:ZDKJS:616614:AFS:3843920,d_20131219194420-642464756)
(keyData,KNZKCZY:ZDKJS:616661:AFS:3844040,d_20131219194420-642464756)
grunt>
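Why the escaped forms '\\001' and '\\002' work: the decode() helper in the source above strips the leading backslash and hands the rest to Byte.decode, which treats a leading zero as octal, so "\001" becomes the single byte 0x01. A standalone sketch of that conversion (DelimiterDecodeDemo is just a demo name):

public class DelimiterDecodeDemo {
    public static void main(String[] args) {
        String arg = "\\001";            // what the grunt command line passes to the constructor
        String body = arg.substring(1);  // "001"
        byte b = Byte.decode(body);      // leading '0' means octal, so this is 0x01
        System.out.println((int) b);     // prints 1
    }
}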
Sometimes you don't want to spell out a concrete schema in the load statement (for example there are too many fields, or the layout isn't something you want to repeat everywhere) and would rather have the loader supply an existing schema.
To do that, implement the LoadMetadata interface and override getSchema (the same method shown in the full source above):

@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    ResourceSchema rs = new ResourceSchema();
    FieldSchema c1 = new FieldSchema("c1", DataType.INTEGER);
    FieldSchema c2 = new FieldSchema("c2", DataType.INTEGER);
    FieldSchema c3 = new FieldSchema("c3", DataType.DOUBLE);
    ResourceFieldSchema fs1 = new ResourceFieldSchema(c1);
    ResourceFieldSchema fs2 = new ResourceFieldSchema(c2);
    ResourceFieldSchema fs3 = new ResourceFieldSchema(c3);
    rs.setFields(new ResourceFieldSchema[] { fs1, fs2, fs3 });
    return rs;
}
In this simple example the loader just returns a fixed schema directly, so the load statement picks it up without an AS clause:
grunt> register /home/pig/pig-0.11.0/udflib/myStorage.jar
grunt> a = load 'student' using com.hcr.hadoop.pig.MyStorage(',','/');
grunt> describe a;
a: {c1: int,c2: int,c3: double}
grunt> b = foreach a generate c1,c2,c3;
grunt> describe b;
b: {c1: int,c2: int,c3: double}
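If hard-coding FieldSchema objects feels verbose, an alternative (my own suggestion, not from the original post) is to build the same schema from a schema string with Pig's Utils.getSchemaFromString; a sketch, assuming that helper is available in your Pig version:

import java.io.IOException;

import org.apache.pig.ResourceSchema;
import org.apache.pig.impl.util.Utils;

public class SchemaFromStringSketch {
    // returns the same {c1: int, c2: int, c3: double} schema as getSchema() above
    public static ResourceSchema threeColumnSchema() throws IOException {
        return new ResourceSchema(Utils.getSchemaFromString("c1:int, c2:int, c3:double"));
    }
}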