The small-files problem can be solved with the Hadoop API, specifically the abstract class CombineFileInputFormat. Its basic idea is a custom InputFormat that packs many small files into a single Hadoop split (chunk). Using CombineFileInputFormat requires implementing two custom classes:
1. CombineSmallfileInputFormat, a subclass of CombineFileInputFormat that supports the custom input format.
2. CombineSmallfileRecordReader, a custom RecordReader.
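CombineFileInputFormat keeps packing files into one split until a size ceiling is reached. The subclass below leaves that ceiling to the driver (which sets mapreduce.input.fileinputformat.split.maxsize); an alternative, sketched here under the assumption of a ~150 MB cap, is to pin it in the subclass constructor through the protected setMaxSplitSize method:
public CombineSmallfileInputFormat() {
super();
// Assumed cap: stop packing small files into one split at ~150 MB.
// Equivalent to setting mapreduce.input.fileinputformat.split.maxsize.
setMaxSplitSize(157286400);
}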
public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
CombineFileSplit combineFileSplit = (CombineFileSplit) split;
CombineFileRecordReader<LongWritable, Text> recordReader = new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
try {
recordReader.initialize(combineFileSplit, context);
} catch (InterruptedException e) {
throw new RuntimeException("Failed to initialize CombineSmallfileRecordReader.", e);
}
return recordReader;
}
private final Log LOG = LogFactory.getLog(getClass());
@Override
protected List<FileStatus> listStatus(JobContext job) throws IOException {
LOG.info("###################### CombineSmallfileInputFormat.listStatus invoked ######################");
Path[] dirs = getInputPaths(job);
if (dirs.length == 0) {
throw new IOException("No input paths specified in job");
}
// get tokens for all the required FileSystems..
TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());
// whether we need to look recursively into the directory structure
boolean recursive = getInputDirRecursive(job);
// creates a MultiPathFilter with the hiddenFileFilter and the
// user provided one (if any).
List<PathFilter> filters = new ArrayList<PathFilter>();
filters.add(hiddenFileFilter);
PathFilter jobFilter = getInputPathFilter(job);
if (jobFilter != null) {
filters.add(jobFilter);
}
PathFilter inputFilter = new MultiPathFilter(filters);
List<FileStatus> result = null;
int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
Stopwatch sw = new Stopwatch().start();
if (numThreads == 1) {
result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
} else {
Iterable<FileStatus> locatedFiles = null;
try {
LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job.getConfiguration(),
dirs, recursive, inputFilter, true);
locatedFiles = locatedFileStatusFetcher.getFileStatuses();
} catch (InterruptedException e) {
throw new IOException("Interrupted while getting file statuses");
}
result = Lists.newArrayList(locatedFiles);
}
sw.stop();
if (LOG.isDebugEnabled()) {
LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
}
LOG.info("Total input paths to process : " + result.size());
return result;
}
private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
boolean recursive) throws IOException {
List<FileStatus> result = new ArrayList<FileStatus>();
// Unlike the stock FileInputFormat, which collects List<IOException> errors,
// problems are collected as strings and logged rather than failing the job.
List<String> errors = new ArrayList<String>();
for (int i = 0; i < dirs.length; ++i) {
Path p = dirs[i];
FileSystem fs = p.getFileSystem(job.getConfiguration());
FileStatus[] matches = fs.globStatus(p, inputFilter);
if (matches == null) {
// errors.add(new IOException("Input path does not exist: " + p));
errors.add("Input path does not exist: " + p);
} else if (matches.length == 0) {
// errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
errors.add("Input Pattern " + p + " matches 0 files");
} else {
for (FileStatus globStat : matches) {
if (globStat.isDirectory()) {
RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
while (iter.hasNext()) {
LocatedFileStatus stat = iter.next();
if (inputFilter.accept(stat.getPath())) {
if (recursive && stat.isDirectory()) {
addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
} else {
result.add(stat);
}
}
}
} else {
result.add(globStat);
}
}
}
}
if (!errors.isEmpty()) {
// The stock implementation throws new InvalidInputException(errors);
// here missing or empty input paths are only logged.
for (String error : errors) {
LOG.error(error);
}
}
return result;
}
/**
* Proxy PathFilter that accepts a path only if all filters given in the
* constructor do. Used by listStatus() to apply the built-in
* hiddenFileFilter together with a user provided one (if any).
*/
private static class MultiPathFilter implements PathFilter {
private List<PathFilter> filters;
public MultiPathFilter(List<PathFilter> filters) {
this.filters = filters;
}
public boolean accept(Path path) {
for (PathFilter filter : filters) {
if (!filter.accept(path)) {
return false;
}
}
return true;
}
}
private static final PathFilter hiddenFileFilter = new PathFilter() {
public boolean accept(Path p) {
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
};
}
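CombineFileInputFormat also exposes a protected isSplitable(JobContext, Path) hook. A common companion override, shown as a sketch rather than part of the original class, keeps every file in one piece so a small file is never cut across combined splits (useful when inputs use a non-splittable compression codec):
@Override
protected boolean isSplitable(JobContext context, Path file) {
// Never split an individual file across chunks.
return false;
}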
public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {
private static Log log = LogFactory.getLog(CombineSmallfileRecordReader.class);
private CombineFileSplit combineFileSplit;
private LineRecordReader lineRecordReader = new LineRecordReader();
private Path[] paths;
private int totalLength;
private int currentIndex;
private float currentProgress = 0;
private LongWritable currentKey;
private Text currentValue = new Text();
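// CombineFileRecordReader requires exactly this (CombineFileSplit,
// TaskAttemptContext, Integer) constructor signature, which it invokes
// reflectively once per file chunk in the split.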
public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
super();
this.combineFileSplit = combineFileSplit;
this.currentIndex = index; // index of the small-file chunk this reader handles within the CombineFileSplit
// FileStatus f = FileSystem.get(context.getConfiguration()).getFileStatus(combineFileSplit.getPath(currentIndex));
try {
context.getConfiguration().set("file.filename",combineFileSplit.getPath(currentIndex).getName());
context.getConfiguration().set("file.modificationtime",
FileSystem.get(context.getConfiguration()).getFileStatus(combineFileSplit.getPath(currentIndex)).getModificationTime()+"");
} catch (Exception e) {
log.error("Failed to record file metadata in the task configuration", e);
}
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
this.combineFileSplit = (CombineFileSplit) split;
// Read one small-file chunk of the CombineFileSplit. Since a LineRecordReader
// is used, a FileSplit must be constructed for the current chunk before reading.
FileSplit fileSplit = null;
try {
fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex), combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
lineRecordReader.initialize(fileSplit, context);
} catch (FileNotFoundException e) {
log.error("File not found while initializing LineRecordReader", e);
} catch (Exception e) {
log.error("Failed to initialize LineRecordReader", e);
}
this.paths = combineFileSplit.getPaths();
totalLength = paths.length;
// context.getConfiguration().set("map.input.file.path", combineFileSplit.getPath(currentIndex).toString());
}
@Override
public LongWritable getCurrentKey() throws IOException, InterruptedException {
currentKey = lineRecordReader.getCurrentKey();
return currentKey;
}
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
Text content = lineRecordReader.getCurrentValue();
currentValue.set(content);
return currentValue;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
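// CombineFileRecordReader lazily creates a fresh CombineSmallfileRecordReader
// as each chunk is exhausted, so this method only iterates over lines of the
// current file and reports false at its end.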
boolean exist = false;
if (currentIndex >= 0 && currentIndex < totalLength) {
try{
exist = lineRecordReader.nextKeyValue();
}catch(Exception e){
exist = false;
}
return exist;
} else {
return false;
}
}
@Override
public float getProgress() throws IOException {
if (currentIndex >= 0 && currentIndex < totalLength) {
currentProgress = (float) currentIndex / totalLength;
return currentProgress;
}
return currentProgress;
}
@Override
public void close() throws IOException {
lineRecordReader.close();
}
}
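The reader's constructor stashes the current file's name and modification time in the task configuration. Because CombineFileRecordReader builds a new reader per chunk, those properties track the file currently being read, and a mapper sharing the same task configuration can pick them up. A minimal sketch under that assumption:
// Inside a map() method: read back the metadata set by the reader's constructor.
String fileName = context.getConfiguration().get("file.filename");
String modificationTime = context.getConfiguration().get("file.modificationtime");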
public class WordCount {
// private static Log log = LogFactory.getLog(WordCount.class);
private static Logger log = LoggerFactory.getLogger(WordCount.class);
private static Path in;
private static Path out;
public static void main(String[] args) {
System.setProperty("hadoop.home.dir", "D:\\installed\\work\\hadoop-2.7.3");
if (args.length < 5) {
log.error("Usage: WordCount <inputPath> <outputPath> <jobName> <queueName> <htablePath>");
return;
}
String inputPath = args[0];
String outputPath = args[1];
String jobName = args[2];
String queueName = args[3];
String htablePath = args[4];
log.info("############inputPath:" + inputPath + "############");
log.info("############outputPath:" + outputPath + "############");
if (jobName == null || "".equals(jobName)) {
jobName = "StatisticCount";
}
if (queueName == null || "".equals(queueName)) {
queueName = "statdpi";
}
log.info("############jobName:" + jobName + "############");
log.info("############queueName:" + queueName + "############");
try {
execute(inputPath, outputPath, jobName, queueName, htablePath);
} catch (Exception e) {
log.error(e.getMessage());
e.printStackTrace();
}
}
private static void execute(String inputPath, String outputPath,
String jobName, String queueName, String htablePath) throws Exception {
Configuration conf = new Configuration();
// conf.set("mapreduce.job.queuename", queueName);
conf.set("mapreduce.jobtracker.split.metainfo.maxsize", "100000000");
conf.setLong("mapreduce.map.failures.maxpercent", 100);
conf.setLong("mapreduce.reduce.failures.maxpercent", 100);
conf.setInt("mapreduce.input.fileinputformat.split.minsize", 1);
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 157286400);
// conf.set("htablePath", htablePath);
out = new Path(outputPath);
in = new Path(inputPath);
FileSystem fs = FileSystem.get(conf);
Job job = Job.getInstance(conf);
job.setJobName(jobName);
job.setJarByClass(WordCount.class);
job.setMapperClass(WordCountMapper.class);
job.setReducerClass(WordCountReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
job.setNumReduceTasks(1);
job.setInputFormatClass(CombineSmallfileInputFormat.class);
// Setting CombineFileInputFormat directly fails: it is abstract, so the framework cannot instantiate it.
// job.setInputFormatClass(CombineFileInputFormat.class);
FileInputFormat.addInputPath(job, in);
// FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
//FileOutputFormat.setCompressOutput(job, true);
if (fs.exists(out)) {
fs.delete(out, true);
}
boolean flag = job.waitForCompletion(true);
log.info("#####################执行结果:" + flag);
log.info("#############################################################################################");
}
}
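A hypothetical invocation, with jar name and paths as placeholders (htablePath is accepted but only takes effect if the commented-out conf.set("htablePath", ...) is restored):
hadoop jar wordcount.jar WordCount /data/smallfiles /data/wc_out StatisticCount statdpi /tmp/htable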
class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString().trim();
String[] tokens = StringUtils.split(line, "\t");
for (String tok : tokens) {
word.set(tok);
context.write(word, one);
}
}
}
class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
context.write(key, new IntWritable(sum));
}
}
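Because the reduction is plain associative addition, the same class could also serve as a combiner, shrinking map output before the shuffle. A one-line addition to execute() above (a suggestion, not in the original driver):
job.setCombinerClass(WordCountReducer.class);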