Hadoop command source code: an implementation of the `text` command (a simplified `hadoop fs -text`)

package cn.yws;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Simplified re-implementation of the {@code hadoop fs -text} shell command:
 * prints files to stdout, transparently decoding gzip-compressed files and
 * rendering SequenceFiles as tab-separated {@code key\tvalue} lines.
 */
public class MyText {

	/** Hadoop configuration used for filesystem lookup and copy buffer sizing. */
	private Configuration conf;

	/**
	 * @param conf Hadoop configuration; used by {@link #text(String)} to
	 *             resolve the target {@link FileSystem}
	 */
	public MyText(Configuration conf) {
		super();
		this.conf = conf;
	}

	/**
	 * Expands a glob pattern and applies {@link #process} to every match,
	 * collecting IOExceptions so one failing path does not stop the rest.
	 */
	private abstract class DelayedExceptionThrowing {
		/** Handles a single matched path. */
		abstract void process(Path p, FileSystem srcFs) throws IOException;

		/**
		 * Runs {@link #process} over every path matching {@code srcPattern},
		 * then rethrows: the sole exception if exactly one occurred, or an
		 * aggregate IOException if several did.
		 */
		final void globAndProcess(Path srcPattern, FileSystem srcFs)
				throws IOException {
			List<IOException> exceptions = new ArrayList<IOException>();
			for (Path p : FileUtil.stat2Paths(srcFs.globStatus(srcPattern),
					srcPattern)) {
				try {
					process(p, srcFs);
				} catch (IOException ioe) {
					// Defer: remaining matches must still be attempted.
					exceptions.add(ioe);
				}
			}

			if (!exceptions.isEmpty()) {
				if (exceptions.size() == 1) {
					throw exceptions.get(0);
				}
				throw new IOException("Multiple IOExceptions: " + exceptions);
			}
		}
	}

	/**
	 * Prints every file matching {@code srcf} to stdout, decoding gzip and
	 * SequenceFile formats automatically.
	 *
	 * @param srcf a path or glob pattern naming the file(s) to print
	 * @throws IOException if a match is a directory or cannot be read
	 */
	public void text(String srcf) throws IOException {
		Path srcPattern = new Path(srcf);
		new DelayedExceptionThrowing() {
			@Override
			void process(Path p, FileSystem srcFs) throws IOException {
				if (srcFs.isDirectory(p)) {
					throw new IOException("Source must be a file.");
				}

				printToStdout(forMagic(p, srcFs));
			}
		}.globAndProcess(srcPattern, srcPattern.getFileSystem(getConf()));
	}

	/**
	 * Copies {@code in} to stdout and always closes {@code in}.
	 */
	private void printToStdout(InputStream in) throws IOException {
		try {
			// 'false': copyBytes must not close the streams itself — we close
			// the input below and System.out has to stay open for later files.
			IOUtils.copyBytes(in, System.out, getConf(), false);
		} finally {
			in.close();
		}
	}

	/**
	 * Sniffs the file's magic number and returns a stream yielding its
	 * decoded text: gzip is decompressed, a SequenceFile is rendered as
	 * tab-separated records, anything else (including files too short to
	 * hold a magic number) is passed through unchanged.
	 */
	private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
		FSDataInputStream i = srcFs.open(p);
		try {
			switch (i.readShort()) {
			case 0x1f8b: // gzip magic number, RFC 1952
				i.seek(0);
				return new GZIPInputStream(i);
			case 0x5345: // 'S' 'E' — possibly a SequenceFile ("SEQ")
				if (i.readByte() == 'Q') {
					// SequenceFile.Reader reopens the file itself.
					i.close();
					return new TextRecordInputStream(srcFs.getFileStatus(p),
							srcFs);
				}
				break;
			default:
				break;
			}
		} catch (EOFException e) {
			// Fewer than 2-3 bytes in the file: no magic number possible,
			// fall through and return the raw stream (the original code
			// leaked the stream and propagated EOFException here).
		}
		i.seek(0);
		return i;
	}

	/** @return the Hadoop configuration this instance was built with */
	public Configuration getConf() {
		return conf;
	}

	/**
	 * Adapts a SequenceFile into a byte stream of {@code key\tvalue\n} lines
	 * (one line per record, UTF-8 encoded).
	 */
	private class TextRecordInputStream extends InputStream {
		SequenceFile.Reader r;
		WritableComparable key;
		Writable val;

		// outbuf accumulates the rendered text of the current record;
		// inbuf replays it one byte at a time through read().
		DataInputBuffer inbuf;
		DataOutputBuffer outbuf;

		public TextRecordInputStream(FileStatus f, FileSystem fs)
				throws IOException {
			r = new SequenceFile.Reader(fs, f.getPath(), getConf());
			// Instantiate reusable key/value holders of the file's own types.
			key = ReflectionUtils.newInstance(
					r.getKeyClass().asSubclass(WritableComparable.class),
					getConf());
			val = ReflectionUtils.newInstance(
					r.getValueClass().asSubclass(Writable.class), getConf());
			inbuf = new DataInputBuffer();
			outbuf = new DataOutputBuffer();
		}

		@Override
		public int read() throws IOException {
			int ret;
			if (null == inbuf || -1 == (ret = inbuf.read())) {
				// Current record exhausted: pull the next one, or signal EOF.
				if (!r.next(key, val)) {
					return -1;
				}
				// UTF-8 explicitly — the original used the platform default
				// charset, making output machine-dependent.
				byte[] tmp = key.toString().getBytes(StandardCharsets.UTF_8);
				outbuf.write(tmp, 0, tmp.length);
				outbuf.write('\t');
				tmp = val.toString().getBytes(StandardCharsets.UTF_8);
				outbuf.write(tmp, 0, tmp.length);
				outbuf.write('\n');
				inbuf.reset(outbuf.getData(), outbuf.getLength());
				outbuf.reset();
				ret = inbuf.read();
			}
			return ret;
		}
	}

	/**
	 * CLI entry point: {@code MyText <path-or-glob>}.
	 */
	public static void main(String[] args) {
		if (args.length != 1) {
			System.out.println("Usage: MyText <path-or-glob> (got "
					+ args.length + " arguments)");
			return;
		}
		Configuration conf = new Configuration();
		conf.set("mapred.job.tracker", "192.168.0.58:9001");
		MyText myText = new MyText(conf);
		try {
			myText.text(args[0]);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

}

You may also be interested in: (Hadoop command source code: the `text` command implementation)