package cn.yws;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * Prints the contents of files on a Hadoop file system to standard output.
 *
 * <p>Each file matched by the source pattern is inspected by its leading bytes:
 * gzip streams are decompressed, sequence files are rendered as
 * {@code key<TAB>value<NEWLINE>} records, and anything else is copied verbatim.
 */
public class MyText {

    /** First two bytes of a gzip stream (RFC 1952), read as a big-endian short. */
    private static final short GZIP_MAGIC = 0x1f8b;

    /** The bytes 'S' 'E' read as a big-endian short; followed by 'Q' in a sequence file. */
    private static final short SEQ_MAGIC_PREFIX = 0x5345;

    /** Charset used to encode the textual form of sequence-file keys and values. */
    private static final String RECORD_ENCODING = "UTF-8";

    private Configuration conf;

    /**
     * Creates a printer that uses the given configuration.
     *
     * @param conf configuration used to resolve file systems and to read sequence files
     */
    public MyText(Configuration conf) {
        super();
        this.conf = conf;
    }

    /**
     * Applies {@link #process} to every path matched by a glob pattern, postponing
     * failures until all paths have been attempted.
     */
    private abstract class DelayedExceptionThrowing {

        /**
         * Handles a single matched path.
         *
         * @param p     the path to process
         * @param srcFs the file system the path belongs to
         * @throws IOException if the path cannot be processed
         */
        abstract void process(Path p, FileSystem srcFs) throws IOException;

        /**
         * Expands {@code srcPattern} and processes each resulting path.
         *
         * @param srcPattern the glob pattern to expand
         * @param srcFs      the file system to expand the pattern on
         * @throws IOException the single collected failure as-is, or a summary
         *                     exception when more than one path failed
         */
        final void globAndProcess(Path srcPattern, FileSystem srcFs) throws IOException {
            List<IOException> exceptions = new ArrayList<IOException>();
            for (Path p : FileUtil.stat2Paths(srcFs.globStatus(srcPattern), srcPattern)) {
                try {
                    process(p, srcFs);
                } catch (IOException ioe) {
                    // Remember the failure and keep going with the remaining paths.
                    exceptions.add(ioe);
                }
            }
            if (exceptions.isEmpty()) {
                return;
            }
            if (exceptions.size() == 1) {
                throw exceptions.get(0);
            }
            throw new IOException("Multiple IOExceptions: " + exceptions);
        }
    }

    /**
     * Prints every file matching {@code srcf} to standard output.
     *
     * @param srcf a path or glob pattern naming the file(s) to print
     * @throws IOException if a match is a directory or a file cannot be read
     */
    public void text(String srcf) throws IOException {
        Path srcPattern = new Path(srcf);
        new DelayedExceptionThrowing() {
            @Override
            void process(Path p, FileSystem srcFs) throws IOException {
                if (srcFs.isDirectory(p)) {
                    throw new IOException("Source must be a file.");
                }
                printToStdout(forMagic(p, srcFs));
            }
        }.globAndProcess(srcPattern, srcPattern.getFileSystem(getConf()));
    }

    /**
     * Copies the stream to {@code System.out} and always closes the stream afterwards.
     * {@code System.out} itself is left open.
     *
     * @param in the stream to drain and close
     * @throws IOException if copying or closing fails
     */
    private void printToStdout(InputStream in) throws IOException {
        try {
            IOUtils.copyBytes(in, System.out, getConf(), false);
        } finally {
            in.close();
        }
    }

    /**
     * Opens {@code p} and picks a decoding stream based on the file's leading bytes.
     *
     * <p>Files too short to contain a magic number are returned as plain streams
     * instead of failing with an {@link EOFException}. If anything goes wrong after
     * the file has been opened, the underlying stream is closed before the
     * exception propagates.
     *
     * @param p     the file to open
     * @param srcFs the file system containing the file
     * @return a stream yielding the printable content of the file; the caller must close it
     * @throws IOException if the file cannot be opened or its header cannot be read
     */
    private InputStream forMagic(Path p, FileSystem srcFs) throws IOException {
        FSDataInputStream raw = srcFs.open(p);
        boolean handedOff = false;
        try {
            InputStream result = null;

            short magic = 0;
            boolean haveMagic = true;
            try {
                magic = raw.readShort();
            } catch (EOFException tooShort) {
                // 0- or 1-byte file: no magic number, treat as plain data.
                haveMagic = false;
            }

            if (haveMagic && magic == GZIP_MAGIC) {
                raw.seek(0);
                result = new GZIPInputStream(raw);
            } else if (haveMagic && magic == SEQ_MAGIC_PREFIX) {
                boolean isSequenceFile;
                try {
                    isSequenceFile = raw.readByte() == 'Q';
                } catch (EOFException tooShort) {
                    // File is exactly "SE": not a sequence file.
                    isSequenceFile = false;
                }
                if (isSequenceFile) {
                    // The sequence-file reader opens the path itself.
                    raw.close();
                    result = new TextRecordInputStream(srcFs.getFileStatus(p), srcFs);
                }
            }

            if (result == null) {
                raw.seek(0);
                result = raw;
            }
            handedOff = true;
            return result;
        } finally {
            if (!handedOff) {
                // Closing an already-closed stream is harmless here; errors are swallowed.
                IOUtils.closeStream(raw);
            }
        }
    }

    /**
     * Returns the configuration supplied at construction time.
     *
     * @return the configuration
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Presents a sequence file as a byte stream of text lines, one per record, in
     * the form {@code key.toString() + '\t' + value.toString() + '\n'}.
     */
    private class TextRecordInputStream extends InputStream {
        SequenceFile.Reader r;
        Writable key;
        Writable val;

        DataInputBuffer inbuf;
        DataOutputBuffer outbuf;

        /**
         * Opens the sequence file and instantiates reusable key/value holders.
         *
         * @param f  status of the sequence file to read
         * @param fs file system containing the file
         * @throws IOException if the file cannot be opened as a sequence file
         */
        public TextRecordInputStream(FileStatus f, FileSystem fs) throws IOException {
            r = new SequenceFile.Reader(fs, f.getPath(), getConf());
            boolean ready = false;
            try {
                // Reading only requires Writable; keys need not be comparable.
                key = ReflectionUtils.newInstance(
                        r.getKeyClass().asSubclass(Writable.class), getConf());
                val = ReflectionUtils.newInstance(
                        r.getValueClass().asSubclass(Writable.class), getConf());
                inbuf = new DataInputBuffer();
                outbuf = new DataOutputBuffer();
                ready = true;
            } finally {
                if (!ready) {
                    // Do not leak the reader when the key/value classes are unusable.
                    IOUtils.closeStream(r);
                }
            }
        }

        /**
         * Returns the next byte of the rendered records, loading the next record
         * when the current one has been fully consumed.
         *
         * @return the next byte as a value in 0..255, or -1 when no records remain
         * @throws IOException if the underlying reader fails
         */
        @Override
        public int read() throws IOException {
            int ret = inbuf.read();
            if (ret != -1) {
                return ret;
            }
            if (!r.next(key, val)) {
                return -1;
            }

            // Render the record into outbuf, then expose those bytes through inbuf.
            byte[] tmp = key.toString().getBytes(RECORD_ENCODING);
            outbuf.write(tmp, 0, tmp.length);
            outbuf.write('\t');
            tmp = val.toString().getBytes(RECORD_ENCODING);
            outbuf.write(tmp, 0, tmp.length);
            outbuf.write('\n');
            inbuf.reset(outbuf.getData(), outbuf.getLength());
            outbuf.reset();
            return inbuf.read();
        }

        /**
         * Closes the underlying sequence-file reader.
         *
         * @throws IOException if the reader fails to close
         */
        @Override
        public void close() throws IOException {
            r.close();
            super.close();
        }
    }

    /**
     * Command-line entry point: prints the file(s) named by the single argument.
     *
     * @param args exactly one element, the source path or glob pattern
     */
    public static void main(String[] args) {
        if (args.length != 1) {
            System.out.println("args!==1,len=" + args.length);
            return;
        }
        Configuration conf = new Configuration();
        // NOTE(review): hard-coded job tracker address; presumably specific to one
        // cluster - confirm whether it is still needed.
        conf.set("mapred.job.tracker", "192.168.0.58:9001");
        MyText myText = new MyText(conf);
        try {
            myText.text(args[0]);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}