Packing many small files into a single SequenceFile on Hadoop

1) Problem encountered: the job runs on a cluster, so the localhost in String seqFsUrl = "hdfs://localhost:9000/user/mjiang/target-seq/sdfgz.seq"; was wrong,

and the client kept failing to connect (Retrying connect to server: localhost/127.0.0.1:8020. Already tried 0 time(s).).

So when a program cannot connect to Hadoop, first consider whether the program itself is misconfigured, for example a hard-coded localhost.
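As a quick connectivity check, here is a minimal sketch that points the client at the real namenode instead of localhost. The host/port hdfs://venus:9000 is taken from the commented-out settings in the main listing below and is only an example; substitute your cluster's values. Note that fs.default.name was renamed fs.defaultFS in Hadoop 2.x.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class NamenodeCheck {

	public static void main(String[] args) throws IOException {

		Configuration conf = new Configuration();

		// "venus:9000" mirrors the commented-out settings in the main
		// listing and is only an example; use your namenode's address.
		conf.set("fs.default.name", "hdfs://venus:9000"); // fs.defaultFS on Hadoop 2.x+

		FileSystem fs = FileSystem.get(conf);

		System.out.println("connected to " + fs.getUri());
	}
}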

2) A SequenceFile stores records with the file name (or any other value) as the key and the file contents as the value, yet when it is read back with SequenceFileAsTextInputFormat the key still ends up in the first line of the content.

I haven't analyzed the source code, so the cause is unclear. The sketch below can be used to dump what is actually stored, bypassing SequenceFileAsTextInputFormat entirely.
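A minimal inspection sketch (my addition, not part of the original program): it opens the SequenceFile with the plain SequenceFile.Reader API, prints the declared key/value classes, and counts the records. The class name SeqDump and the command-line argument are illustrative only.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SeqDump {

	public static void main(String[] args) throws IOException {
		Configuration conf = new Configuration();
		Path path = new Path(args[0]); // e.g. user/mjiang/target-seq/sdfgz.seq
		FileSystem fs = FileSystem.get(path.toUri(), conf);
		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, path, conf);
			System.out.println("key class:   " + reader.getKeyClassName());
			System.out.println("value class: " + reader.getValueClassName());
			// Instantiate key/value of whatever classes the file declares.
			Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
			Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
			int n = 0;
			while (reader.next(key, value)) {
				n++;
			}
			System.out.println(n + " records");
		} finally {
			IOUtils.closeStream(reader);
		}
	}
}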

3) A SequenceFile can hold .gz files (I never managed to make this fail; .gz files supposedly cannot be split into blocks??? but logically each one is still a single unit). Since each .gz file is written whole as the value of a single record, gzip's non-splittability does not matter here: the SequenceFile itself can still be split at record boundaries. A read-back sketch that decompresses one record follows the main listing below.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.File;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SmallFilesToSequenceFile {
	
	public static void main(String[] args) throws IOException {
		
		//String seqFsUrl = "hdfs://localhost:9000/user/mjiang/target-seq/sdfgz.seq";
		String seqFsUrl = "user/mjiang/target-seq/sdfgz.seq";

		Configuration conf = new Configuration();
		//conf.set("fs.default.name", "hdfs://venus:9000");
		//conf.set("hadoop.job.user", "mjiang");
		//conf.set("mapred.job.tracker", "venus:9001");

		FileSystem fs = FileSystem.get(URI.create(seqFsUrl), conf);

		Path seqPath = new Path(seqFsUrl);

		//Text key = new Text();

		Text value = new Text();

		String filesPath = "/home/mjiang/java/eclipse/hadoop/sequenceFile/data/sdfgz/";

		File gzFilesDir = new File(filesPath);

		String[] gzFiles = gzFilesDir.list();
		
		int filesLen = gzFiles.length;
		
		SequenceFile.Writer writer = null;
		
		try {
			// createWriter returns a SequenceFile.Writer bound to seqPath;
			// every append() below adds one key/value record to that file.
			writer = SequenceFile.createWriter(fs, conf, seqPath, NullWritable.class, value.getClass());

			while (filesLen > 0) {

				File gzFile = new File(filesPath + gzFiles[filesLen - 1]);

				InputStream in = new BufferedInputStream(new FileInputStream(gzFile));

				try {
					byte[] buff = new byte[(int) gzFile.length()];

					// A single read() may return fewer bytes than requested;
					// readFully loops until the whole file is in the buffer.
					IOUtils.readFully(in, buff, 0, buff.length);

					// Text stores the bytes verbatim; BytesWritable would be
					// the more natural value type for binary data like .gz.
					value.set(buff, 0, buff.length);

					// Append one record per file: NullWritable key, whole
					// file contents as the value.
					writer.append(NullWritable.get(), value);

					System.out.println(gzFiles[filesLen - 1]);
				} finally {
					IOUtils.closeStream(in);
				}

				filesLen--;
			}
		} finally {

			IOUtils.closeStream(writer);

		}
	}
}
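To back up point 3, a hedged read-back sketch (again my addition): it reads one record from the SequenceFile written by the listing above, assuming NullWritable keys and Text values holding raw .gz bytes as in that listing, and decompresses the value with GZIPInputStream. If the decompression succeeds, the .gz payload was stored intact as a single record.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class GzRecordCheck {

	public static void main(String[] args) throws IOException {
		Configuration conf = new Configuration();
		Path path = new Path("user/mjiang/target-seq/sdfgz.seq");
		FileSystem fs = FileSystem.get(path.toUri(), conf);
		SequenceFile.Reader reader = null;
		try {
			reader = new SequenceFile.Reader(fs, path, conf);
			NullWritable key = NullWritable.get();
			Text value = new Text();
			if (reader.next(key, value)) {
				// The value holds the raw bytes of one .gz file; if GZIP
				// can decompress them, the record round-tripped correctly.
				GZIPInputStream gz = new GZIPInputStream(
						new ByteArrayInputStream(value.getBytes(), 0, value.getLength()));
				byte[] buf = new byte[4096];
				int n;
				long total = 0;
				while ((n = gz.read(buf)) != -1) {
					total += n;
				}
				gz.close();
				System.out.println("decompressed " + total + " bytes");
			}
		} finally {
			IOUtils.closeStream(reader);
		}
	}
}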

