参考文档: 《Hadoop: The Definitive Guide》第 3 版, 第 11 章 (Pig)
代码具体地址: tomwhite-hadoop-book-32dae01\ch11\src\main\java\com\hadoopbook\pig
工具类
/**
 * An immutable character range described by a 1-based start position and an
 * inclusive end position, used to cut a fixed-width field out of a line.
 *
 * <p>Example: for the line {@code "abcdefghi"} and the spec {@code "1-2,5-6"},
 * range {@code 1-2} yields {@code "ab"} and range {@code 5-6} yields {@code "ef"}.
 */
public class Range {

  private final int start;
  private final int end;

  /**
   * @param start 1-based position of the first character of the range
   * @param end   1-based position of the last character of the range (inclusive)
   */
  public Range(int start, int end) {
    this.start = start;
    this.end = end;
  }

  /** @return the 1-based start position */
  public int getStart() {
    return start;
  }

  /** @return the 1-based, inclusive end position */
  public int getEnd() {
    return end;
  }

  /**
   * Extracts the characters covered by this range from {@code line}.
   *
   * @param line the text to cut from
   * @return {@code line.substring(start - 1, end)}
   * @throws StringIndexOutOfBoundsException if the range does not fit inside
   *         {@code line}; callers are expected to check {@link #getEnd()} first
   */
  public String getSubstring(String line) {
    // Convert the 1-based inclusive range to substring's 0-based, end-exclusive form.
    return line.substring(start - 1, end);
  }

  @Override
  public int hashCode() {
    return start * 37 + end;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof Range)) {
      return false;
    }
    Range other = (Range) obj;
    return this.start == other.start && this.end == other.end;
  }

  /**
   * Parses a comma-separated list of {@code start-end} pairs such as
   * {@code "1-2,5-6"}.
   *
   * @param rangeSpec the specification to parse; must not be {@code null}
   * @return the parsed ranges in specification order; an empty list when
   *         {@code rangeSpec} is the empty string
   * @throws IllegalArgumentException if an element has no {@code -} separated
   *         end bound, or if a bound is not a valid integer (the
   *         {@link NumberFormatException} is attached as the cause)
   */
  public static List<Range> parse(String rangeSpec) throws IllegalArgumentException {
    if (rangeSpec.length() == 0) {
      return Collections.emptyList();
    }
    List<Range> ranges = new ArrayList<Range>();
    for (String spec : rangeSpec.split(",")) {      // "1-2,5-6" -> ["1-2", "5-6"]
      String[] bounds = spec.split("-");            // "1-2"     -> ["1", "2"]
      // split() drops trailing empty strings, so "5" and "1-" both yield a
      // single element; reject them here instead of failing on bounds[1].
      if (bounds.length < 2) {
        throw new IllegalArgumentException(
            "Invalid range '" + spec + "': expected <start>-<end>");
      }
      try {
        ranges.add(new Range(Integer.parseInt(bounds[0]), Integer.parseInt(bounds[1])));
      } catch (NumberFormatException e) {
        // Keep the original exception as the cause so the stack trace is not lost.
        throw new IllegalArgumentException(
            "Invalid range '" + spec + "': " + e.getMessage(), e);
      }
    }
    return ranges;
  }
}
自定义函数
public class CutLoadFunc extends LoadFunc { private static final Log LOG = LogFactory.getLog(CutLoadFunc.class); private final List<Range> ranges; //工具类,解析范围参数,范围针对的是文件里的每一行数据.进行截取操作 private final TupleFactory tupleFactory = TupleFactory.getInstance(); private RecordReader reader; //LOAD命令加载文件,reader会去读取文件里的每一行数据 public CutLoadFunc(String cutPattern) { //构造函数接收参数:范围->'16-19,88-92,93-93' ranges = Range.parse(cutPattern); } @Override public void setLocation(String location, Job job) throws IOException { FileInputFormat.setInputPaths(job, location); } @Override public InputFormat getInputFormat() { return new TextInputFormat(); } @Override public void prepareToRead(RecordReader reader, PigSplit split) { this.reader = reader; } //读取文件里的每一行记录 @Override public Tuple getNext() throws IOException { try { if (!reader.nextKeyValue()) { return null; } Text value = (Text) reader.getCurrentValue(); //读取到当前行的数据 String line = value.toString(); //line为当前行的数据 Tuple tuple = tupleFactory.newTuple(ranges.size()); for (int i = 0; i < ranges.size(); i++) { Range range = ranges.get(i); //调用构造函数时,通过范围参数,已经将List<Range>的数据填充完毕.即此时每一个Range对象的start.end都是有值的. if (range.getEnd() > line.length()) { LOG.warn(String.format( "Range end (%s) is longer than line length (%s)", range.getEnd(), line.length())); continue; } tuple.set(i, new DataByteArray(range.getSubstring(line))); //传入当前行的数据,调用截取方法(start,end已经有值),截取当前行对应的范围的字符串. } return tuple; //tuple可以看做是一个上下文. 参数就是通过上下文传入的. } catch (InterruptedException e) { throw new ExecException(e); } } }