(3)下面实现一个自己的InputFormat,需要处理的数据为(时间:URL)
public class TimeUrlTextInputInputFormat extends FileInputFormat<Text,URLWritable>{ public RecordReader<Text,URLWritable> getRecordReader( InputSplit input,JobConf job,Reporter reporter)throws IOException{ return new TimeUrlLineRecorderReader(job,(FileSplit)input); } } pulbic class URLWritable implements Writable{ protected URL url; public URLWritable(){} public URLWritable(URL url){ This.url=url; } public void write(DataOutput out) throws IOException{ Out.writeUTF(url.toString()); } public void readFields(DataInput in) throws IOException{ url=new URL(in.readUTF()); } public void set(String s)throws MalformedURLException{ Url=new URL(s); } } class TimeUrlLineRecordReader implements RecordReader<Text,URLWritable>{ private KeyValueLineRecorderReader lineReader; private Text lineKey,lineValue; public TimeUrlLineRecordReader(JobConf job,FileSplit split) throws IOException{ lineRecorder=new KeyValueLineRecordReader(job,split); lineKey=lineReader.createKey(); lineValue=lineReader.createValue(); } public boolean next(Text key,URLWritable value) throws IOException{ if(!lineReader.next(lineKey,lineValue)){ Return false; } key.set(lineKey); Value.set(lineValue.toString()); return true; } public Text createKey(){ Return new Text(""); } public URLWritable createValue(){ return new URLWritable(); } public long getPos() throws IOException{ Return lineRecorder.getPos(); } public float getProgress() throws IOException{ Return lineReader.getProgress(); } public void close() throws IOException{ lineReader.close(); } }
七、输出格式 outputFormat
hadoop中实现了OutputFormat接口的类有如下几个
TextOutputFormat<K,V>:每条记录输出为一行,键和值之间默认用tab分隔,分隔符可以通过mapred.textoutputformat.separator
属性进行更换。
SequenceFileOutputFormat<K,V>:以SequenceFile格式输出,和SequenceFileInputFormat搭配使用
NullOutputFormat<K,V>:什么都不输出
<!--EndFragment-->