2. Customizing the RecordReader: implement the RecordReader interface (old API) or extend the RecordReader class (new API). The example below uses the new API and implements the following methods:
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
The core of the work is the logic that iterates over the multi-line text making up each record: each call to nextKeyValue() decides whether another record can be read and sets the current key and value, which are then returned by getCurrentKey() and getCurrentValue() respectively. The implementation code below contains comments explaining the details.
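For orientation, here is a minimal skeleton of the methods a new-API RecordReader must override (the method names and generic types follow org.apache.hadoop.mapreduce.RecordReader; the bodies are placeholders only, the real implementation follows further down):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Skeleton only: every new-API RecordReader overrides these six methods.
public class SkeletonRecordReader extends RecordReader<LongWritable, Text> {
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // open the split and position the underlying stream
    }
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        return false; // return true while another record is available
    }
    @Override
    public LongWritable getCurrentKey() { return null; } // key of the record read by nextKeyValue()
    @Override
    public Text getCurrentValue() { return null; }       // value of the record read by nextKeyValue()
    @Override
    public float getProgress() { return 0f; }            // fraction of the split consumed, 0.0 to 1.0
    @Override
    public void close() throws IOException { }
}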
Custom InputFormat:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class XMLInputFormat extends TextInputFormat {

    private static final Logger log = LoggerFactory.getLogger(XMLInputFormat.class);

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit inputSplit, TaskAttemptContext context) {
        try {
            return new XMLRecordReader(inputSplit, context.getConfiguration());
        } catch (IOException e) {
            log.warn("Error while creating XmlRecordReader", e);
            return null;
        }
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Delegate to TextInputFormat's default splitting behavior.
        return super.isSplitable(context, file);
    }
}
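A note on the isSplitable override above: the RecordReader defined next already copes with records that cross a split boundary (each reader finishes any record whose start tag falls inside its split, even if the end tag lies in the next one), so delegating to TextInputFormat's splitting is workable. If you would rather sidestep split-boundary handling entirely, a common alternative is to make the input non-splittable. The class below is only a sketch of that option and is not part of the original post:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.JobContext;

public class NonSplittableXMLInputFormat extends XMLInputFormat {
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Never split: each XML file is read in full by a single RecordReader,
        // at the cost of less parallelism on very large files.
        return false;
    }
}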
Define the RecordReader (this is the key part of handling the XML file):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class XMLRecordReader extends RecordReader<LongWritable, Text> {

    private long start;
    private long end;
    private FSDataInputStream fsin;
    private DataOutputBuffer buffer = new DataOutputBuffer();
    private byte[] startTag;
    private byte[] endTag;
    private LongWritable currentKey;
    private Text currentValue;

    public static final String START_TAG_KEY = "xmlinput.start";
    public static final String END_TAG_KEY = "xmlinput.end";

    public XMLRecordReader() {
    }

    /**
     * Initializes the input resources and related parameters; this work could
     * equally be done in the initialize() method.
     * @param inputSplit
     * @param context
     * @throws IOException
     */
    public XMLRecordReader(InputSplit inputSplit, Configuration context) throws IOException {
        // Get the start and end tags passed in through the configuration.
        startTag = context.get(START_TAG_KEY).getBytes("UTF-8");
        endTag = context.get(END_TAG_KEY).getBytes("UTF-8");
        FileSplit fileSplit = (FileSplit) inputSplit;
        // Get the start and end positions of the split.
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(context);
        // Open an HDFS input stream for the split's file.
        fsin = fs.open(fileSplit.getPath());
        // Seek to the beginning of the split.
        fsin.seek(start);
    }

    @Override
    public void close() throws IOException {
        fsin.close();
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return (fsin.getPos() - start) / (float) (end - start);
    }

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        /*startTag = context.getConfiguration().get(START_TAG_KEY).getBytes("UTF-8");
        endTag = context.getConfiguration().get(END_TAG_KEY).getBytes("UTF-8");
        FileSplit fileSplit = (FileSplit) inputSplit;
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        fsin = fs.open(fileSplit.getPath());
        fsin.seek(start);*/
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        currentKey = new LongWritable();
        currentValue = new Text();
        return next(currentKey, currentValue);
    }

    private boolean next(LongWritable key, Text value) throws IOException {
        /*
         * Use readUntilMatch to scan for the start tag of an XML fragment; only
         * once it is found do we begin writing the fragment into the buffer.
         * When readUntilMatch's second argument is false, nothing is written to
         * the buffer during the scan; when it is true, every byte read is also
         * written to the buffer.
         */
        if (fsin.getPos() < end && readUntilMatch(startTag, false)) {
            // Reaching this block means the start tag was found; fsin now points
            // just past the last byte of the start tag, so write the start tag
            // into the buffer first.
            buffer.write(startTag);
            try {
                // Scan for the end tag, recording every byte read, until it is found.
                if (readUntilMatch(endTag, true)) {
                    // Once the end tag is found, set the key to the current file
                    // offset (just past the end tag) and set the value to the
                    // complete XML fragment accumulated in the buffer.
                    key.set(fsin.getPos());
                    value.set(buffer.getData(), 0, buffer.getLength());
                    return true;
                }
            } finally {
                buffer.reset();
            }
        }
        return false;
    }

    /**
     * Reads the XML file looking for the given tag.
     * @param startTag
     * @param isWrite
     * @return
     * @throws IOException
     */
    private boolean readUntilMatch(byte[] startTag, boolean isWrite) throws IOException {
        int i = 0;
        while (true) {
            // Read a single byte from the input stream.
            int b = fsin.read();
            if (b == -1) {
                return false;
            }
            // While scanning for the start tag nothing is recorded; while
            // scanning for the end tag every byte read is written to the buffer.
            if (isWrite) {
                buffer.write(b);
            }
            // Check whether the bytes read so far match the tag; a full match
            // ends the scan.
            if (b == startTag[i]) {
                i++;
                if (i >= startTag.length) {
                    return true;
                }
            } else {
                i = 0;
            }
            // See if we've passed the stop point:
            if (!isWrite && i == 0 && fsin.getPos() >= end) {
                return false;
            }
        }
    }
}
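To make the record boundaries concrete, here is the kind of input this reader is meant to carve up. The fragment is made up for illustration (the original post does not show its test file); the first name/value pair matches the sample used in the mapper's main() method below. With xmlinput.start set to <property> and xmlinput.end set to </property>, each <property> block becomes one value handed to the mapper:

<configuration>
  <property>
    <name>seven</name>
    <value>24</value>
  </property>
  <property>
    <name>eight</name>
    <value>25</value>
  </property>
</configuration>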
Map phase:

import static javax.xml.stream.XMLStreamConstants.CHARACTERS;
import static javax.xml.stream.XMLStreamConstants.START_ELEMENT;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class XMLMapper extends Mapper<LongWritable, Text, Text, Text> {

    /**
     * As XMLRecordReader shows, the incoming value is a complete fragment
     * bounded by the configured start and end tags.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String content = value.toString();
        System.out.println("--content--");
        try {
            // Turn the value into an XML input stream for further processing.
            XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(
                    new ByteArrayInputStream(content.getBytes()));
            String propertyName = "";
            String propertyValue = "";
            String currentElement = "";
            // The parsing logic below; the main() method at the bottom shows
            // what it does on a sample fragment.
            while (reader.hasNext()) {
                int code = reader.next();
                switch (code) {
                case START_ELEMENT:
                    currentElement = reader.getLocalName();
                    break;
                case CHARACTERS:
                    if (currentElement.equalsIgnoreCase("name")) {
                        propertyName += reader.getText();
                    } else if (currentElement.equalsIgnoreCase("value")) {
                        propertyValue += reader.getText();
                    }
                    break;
                }
            }
            reader.close();
            context.write(new Text(propertyName.trim()), new Text(propertyValue.trim()));
        } catch (XMLStreamException e) {
            e.printStackTrace();
        } catch (FactoryConfigurationError e) {
            e.printStackTrace();
        }
    }

    public static void main(String[] args) {
        String content = "<property><name>seven</name><value>24</value></property>";
        System.out.println("--content--");
        try {
            XMLStreamReader reader = XMLInputFactory.newInstance().createXMLStreamReader(
                    new ByteArrayInputStream(content.getBytes()));
            String propertyName = "";
            String propertyValue = "";
            String currentElement = "";
            while (reader.hasNext()) {
                int code = reader.next();
                switch (code) {
                case START_ELEMENT:
                    currentElement = reader.getLocalName();
                    break;
                case CHARACTERS:
                    if (currentElement.equalsIgnoreCase("name")) {
                        propertyName += reader.getText();
                    } else if (currentElement.equalsIgnoreCase("value")) {
                        propertyValue += reader.getText();
                    }
                    break;
                }
            }
            reader.close();
            System.out.println(propertyName + " " + propertyValue);
        } catch (XMLStreamException e) {
            e.printStackTrace();
        } catch (FactoryConfigurationError e) {
            e.printStackTrace();
        }
    }
}
Reduce phase:

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class XMLReducer extends Reducer<Text, Text, Text, Text> {

    private Text val_ = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> value, Context context)
            throws IOException, InterruptedException {
        // Pass every value through unchanged, one output record per value.
        for (Text val : value) {
            val_.set(val.toString());
            context.write(key, val_);
        }
    }
}
Driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        configuration.set("key.value.separator.in.input.line", " ");
        // The start and end tags XMLRecordReader uses to delimit records.
        configuration.set("xmlinput.start", "<property>");
        configuration.set("xmlinput.end", "</property>");
        Job job = new Job(configuration, "xmlread-job");
        job.setJarByClass(JobMain.class);
        job.setMapperClass(XMLMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(XMLInputFormat.class);
        job.setNumReduceTasks(1);
        job.setReducerClass(XMLReducer.class);
        //job.setOutputFormatClass(XMLOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path output = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, output);
        // Delete any existing output directory so the job can be rerun cleanly.
        output.getFileSystem(configuration).delete(output, true);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Run results:
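For illustration only (the jar name and HDFS paths below are placeholders, not taken from the original post), the job takes the input and output directories as its two arguments:

hadoop jar xmlread.jar JobMain /user/hadoop/xml-in /user/hadoop/xml-out

Given an input record such as <property><name>seven</name><value>24</value></property> (the same fragment the mapper's main() method tests), the mapper emits the pair (seven, 24), the single reducer passes it through, and the default TextOutputFormat writes it to part-r-00000 as a tab-separated line:

seven	24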
This walkthrough of processing XML input files with MapReduce has covered customizing the InputFormat and RecordReader. The next post builds on this example to cover customizing the OutputFormat and RecordWriter, writing the final results back out as an XML file; see 《MapReduce-XML处理-定制OutputFormat及定制RecordWriter》 (MapReduce XML Processing: Custom OutputFormat and Custom RecordWriter).