MultipleInputs handles multiple input sources. This example combines two of them: MySQL data from a Windows machine and text data stored on HDFS.
MySQL data (the DB table holds a single row: id=1, name=鲍礼彬):
HDFS data:
[root@baolibin ~]# hadoop fs -text /input/hehe
Warning: $HADOOP_HOME is deprecated.
hello you
hello me
hello you
hello me
Code:
The code is kept simple: it is a MapReduce job with no custom reducer, written only to demonstrate wiring up multiple input sources and reading the data out:
package hadoop_2_6_0;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.db.DBWritable;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MultipleInputsTest {

    // Mapper for the HDFS text source: passes every line through unchanged.
    public static class TextMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        final LongWritable k2 = new LongWritable();
        final Text v2 = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            v2.set(value.toString());
            context.write(k2, v2);
        }
    }

    // Mapper for the database source: emits each row's toString().
    public static class DBMapper extends Mapper<LongWritable, MyDBWritable, LongWritable, Text> {
        final Text v2 = new Text();

        @Override
        protected void map(LongWritable key, MyDBWritable value, Context context)
                throws IOException, InterruptedException {
            v2.set(value.toString());
            context.write(key, v2);
        }
    }

    // Custom record type: Writable for the MapReduce framework, DBWritable for JDBC.
    public static class MyDBWritable implements Writable, DBWritable {
        int id;
        String name;

        // DBWritable: bind fields to a prepared statement (used when writing to a DB).
        public void write(PreparedStatement statement) throws SQLException {
            statement.setInt(1, id);
            statement.setString(2, name);
        }

        // DBWritable: read fields from one row of a query result.
        public void readFields(ResultSet resultSet) throws SQLException {
            this.id = resultSet.getInt(1);
            this.name = resultSet.getString(2);
        }

        // Writable: serialize for the shuffle. Note writeInt, not write --
        // out.write(id) would emit a single byte and break readFields below.
        public void write(DataOutput out) throws IOException {
            out.writeInt(id);
            out.writeUTF(name);
        }

        // Writable: deserialize in the same order and widths as write().
        public void readFields(DataInput in) throws IOException {
            this.id = in.readInt();
            this.name = in.readUTF();
        }

        @Override
        public String toString() {
            return "MyDBWritable[id=" + id + ",\t" + "name=" + name + "]";
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        // JDBC connection settings; without this, DBInputFormat cannot connect.
        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
                "jdbc:mysql://192.168.1.1:3306/oled", "root", "mysqladmin");

        final Job job = Job.getInstance(conf, MultipleInputsTest.class.getSimpleName());
        job.setJarByClass(MultipleInputsTest.class);

        // 1.1 Input paths are registered per source via MultipleInputs below,
        // so the usual FileInputFormat.setInputPaths(...) is not needed here.
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);

        // 2.2
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        DBInputFormat.setInput(job, MyDBWritable.class,
                "select id,name from DB", "select count(1) from DB");
        MultipleInputs.addInputPath(job, new Path("hdfs://192.168.1.100:9000/input/hehe"),
                TextInputFormat.class, TextMapper.class);
        // The path here is only a placeholder: DBInputFormat reads from the database,
        // but MultipleInputs still needs a path to key the format/mapper mapping.
        MultipleInputs.addInputPath(job, new Path("hdfs://192.168.1.100:9000/"),
                DBInputFormat.class, DBMapper.class);

        // 2.3
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.1.100:9000/DBout1"));

        job.waitForCompletion(true);
    }
}
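One side note on the "no custom reducer" point above: because no reducer class is set, Hadoop still runs the default identity reducer, which is why the console output below shows a reduce phase. If a genuinely map-only job is wanted, the reduce phase can be switched off explicitly; a one-line sketch:

// Disable the reduce phase entirely; output files become part-m-* instead of part-r-*.
job.setNumReduceTasks(0);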
Console output:
15/04/16 16:06:03 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
15/04/16 16:06:03 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
15/04/16 16:06:03 WARN mapred.JobClient: No job jar file set. User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
15/04/16 16:06:04 INFO input.FileInputFormat: Total input paths to process : 1
15/04/16 16:06:04 WARN snappy.LoadSnappy: Snappy native library not loaded
15/04/16 16:06:05 INFO mapred.JobClient: Running job: job_local942775997_0001
15/04/16 16:06:05 INFO mapred.LocalJobRunner: Waiting for map tasks
15/04/16 16:06:05 INFO mapred.LocalJobRunner: Starting task: attempt_local942775997_0001_m_000000_0
15/04/16 16:06:05 INFO mapred.Task: Using ResourceCalculatorPlugin : null
15/04/16 16:06:05 INFO mapred.MapTask: Processing split: hdfs://192.168.1.100:9000/input/hehe:0+38
15/04/16 16:06:05 INFO mapred.MapTask: io.sort.mb = 100
15/04/16 16:06:05 INFO mapred.MapTask: data buffer = 79691776/99614720
15/04/16 16:06:05 INFO mapred.MapTask: record buffer = 262144/327680
15/04/16 16:06:06 INFO mapred.MapTask: Starting flush of map output
15/04/16 16:06:06 INFO mapred.MapTask: Finished spill 0
15/04/16 16:06:06 INFO mapred.Task: Task:attempt_local942775997_0001_m_000000_0 is done. And is in the process of commiting
15/04/16 16:06:06 INFO mapred.LocalJobRunner:
15/04/16 16:06:06 INFO mapred.Task: Task 'attempt_local942775997_0001_m_000000_0' done.
15/04/16 16:06:06 INFO mapred.LocalJobRunner: Finishing task: attempt_local942775997_0001_m_000000_0
15/04/16 16:06:06 INFO mapred.LocalJobRunner: Starting task: attempt_local942775997_0001_m_000001_0
15/04/16 16:06:06 INFO mapred.Task: Using ResourceCalculatorPlugin : null
15/04/16 16:06:06 INFO mapred.MapTask: Processing split: org.apache.hadoop.mapreduce.lib.db.DBInputFormat$DBInputSplit@3c3a1834
15/04/16 16:06:06 INFO mapred.MapTask: io.sort.mb = 100
15/04/16 16:06:06 INFO mapred.MapTask: data buffer = 79691776/99614720
15/04/16 16:06:06 INFO mapred.MapTask: record buffer = 262144/327680
15/04/16 16:06:06 INFO mapred.MapTask: Starting flush of map output
15/04/16 16:06:06 INFO mapred.MapTask: Finished spill 0
15/04/16 16:06:06 INFO mapred.Task: Task:attempt_local942775997_0001_m_000001_0 is done. And is in the process of commiting
15/04/16 16:06:06 INFO mapred.LocalJobRunner:
15/04/16 16:06:06 INFO mapred.Task: Task 'attempt_local942775997_0001_m_000001_0' done.
15/04/16 16:06:06 INFO mapred.LocalJobRunner: Finishing task: attempt_local942775997_0001_m_000001_0
15/04/16 16:06:06 INFO mapred.LocalJobRunner: Map task executor complete.
15/04/16 16:06:06 INFO mapred.JobClient:  map 100% reduce 0%
15/04/16 16:06:07 INFO mapred.Task: Using ResourceCalculatorPlugin : null
15/04/16 16:06:07 INFO mapred.LocalJobRunner:
15/04/16 16:06:07 INFO mapred.Merger: Merging 2 sorted segments
15/04/16 16:06:08 INFO mapred.Merger: Down to the last merge-pass, with 2 segments left of total size: 127 bytes
15/04/16 16:06:08 INFO mapred.LocalJobRunner:
15/04/16 16:06:09 INFO mapred.Task: Task:attempt_local942775997_0001_r_000000_0 is done. And is in the process of commiting
15/04/16 16:06:09 INFO mapred.LocalJobRunner:
15/04/16 16:06:09 INFO mapred.Task: Task attempt_local942775997_0001_r_000000_0 is allowed to commit now
15/04/16 16:06:09 INFO output.FileOutputCommitter: Saved output of task 'attempt_local942775997_0001_r_000000_0' to hdfs://192.168.1.100:9000/DBout1
15/04/16 16:06:09 INFO mapred.LocalJobRunner: reduce > reduce
15/04/16 16:06:09 INFO mapred.Task: Task 'attempt_local942775997_0001_r_000000_0' done.
15/04/16 16:06:10 INFO mapred.JobClient:  map 100% reduce 100%
15/04/16 16:06:10 INFO mapred.JobClient: Job complete: job_local942775997_0001
15/04/16 16:06:10 INFO mapred.JobClient: Counters: 19
15/04/16 16:06:10 INFO mapred.JobClient:   File Output Format Counters
15/04/16 16:06:10 INFO mapred.JobClient:     Bytes Written=83
15/04/16 16:06:10 INFO mapred.JobClient:   FileSystemCounters
15/04/16 16:06:10 INFO mapred.JobClient:     FILE_BYTES_READ=2727
15/04/16 16:06:10 INFO mapred.JobClient:     HDFS_BYTES_READ=114
15/04/16 16:06:10 INFO mapred.JobClient:     FILE_BYTES_WRITTEN=215188
15/04/16 16:06:10 INFO mapred.JobClient:     HDFS_BYTES_WRITTEN=83
15/04/16 16:06:10 INFO mapred.JobClient:   File Input Format Counters
15/04/16 16:06:10 INFO mapred.JobClient:     Bytes Read=0
15/04/16 16:06:10 INFO mapred.JobClient:   Map-Reduce Framework
15/04/16 16:06:10 INFO mapred.JobClient:     Map output materialized bytes=135
15/04/16 16:06:10 INFO mapred.JobClient:     Map input records=5
15/04/16 16:06:10 INFO mapred.JobClient:     Reduce shuffle bytes=0
15/04/16 16:06:10 INFO mapred.JobClient:     Spilled Records=10
15/04/16 16:06:10 INFO mapred.JobClient:     Map output bytes=113
15/04/16 16:06:10 INFO mapred.JobClient:     Total committed heap usage (bytes)=685178880
15/04/16 16:06:10 INFO mapred.JobClient:     SPLIT_RAW_BYTES=476
15/04/16 16:06:10 INFO mapred.JobClient:     Combine input records=0
15/04/16 16:06:10 INFO mapred.JobClient:     Reduce input records=5
15/04/16 16:06:10 INFO mapred.JobClient:     Reduce input groups=1
15/04/16 16:06:10 INFO mapred.JobClient:     Combine output records=0
15/04/16 16:06:10 INFO mapred.JobClient:     Reduce output records=5
15/04/16 16:06:10 INFO mapred.JobClient:     Map output records=5
Result:
[root@baolibin ~]# hadoop fs -ls /DBout1
Warning: $HADOOP_HOME is deprecated.
Found 2 items
-rw-r--r--   3 Administrator supergroup          0 2015-04-16 16:06 /DBout1/_SUCCESS
-rw-r--r--   3 Administrator supergroup         83 2015-04-16 16:06 /DBout1/part-r-00000
[root@baolibin ~]# hadoop fs -text /DBout1/part-*
Warning: $HADOOP_HOME is deprecated.
0	MyDBWritable[id=1,	name=鲍礼彬]
0	hello you
0	hello me
0	hello you
0	hello me

Every key is 0: TextMapper never assigns k2 (so it keeps its default of 0), and the lone database row has record index 0, which is why the counters above report Reduce input groups=1.
Key points:
Define a custom record type that implements both the Writable and DBWritable interfaces:
public static class MyDBWritable implements Writable, DBWritable
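The Writable pair (write(DataOutput) / readFields(DataInput)) must serialize and deserialize fields in exactly the same order and widths, or values are corrupted during the shuffle. A minimal round-trip check, as a sketch (RoundTripCheck is my own test harness, not part of the original post; it assumes it sits in the same hadoop_2_6_0 package so the package-private fields are visible):

package hadoop_2_6_0;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class RoundTripCheck {
    public static void main(String[] args) throws Exception {
        MultipleInputsTest.MyDBWritable before = new MultipleInputsTest.MyDBWritable();
        before.id = 1;
        before.name = "鲍礼彬";

        // Serialize the way the framework does during the shuffle.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        before.write(new DataOutputStream(bytes));

        // Read back into a fresh instance; a mismatched order/width would corrupt this.
        MultipleInputsTest.MyDBWritable after = new MultipleInputsTest.MyDBWritable();
        after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(after); // MyDBWritable[id=1,	name=鲍礼彬]
    }
}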
JDBC is used; specify the driver class, the database URL, the username, and the password:
DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver", "jdbc:mysql://192.168.1.1:3306/oled", "root", "mysqladmin");
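configureDB does not open a connection; it only records these settings in the job Configuration under DBConfiguration's public property constants, where they can be inspected when debugging connection problems. A small sketch (DBConfigCheck is a hypothetical helper of mine):

package hadoop_2_6_0;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;

public class DBConfigCheck {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        DBConfiguration.configureDB(conf, "com.mysql.jdbc.Driver",
                "jdbc:mysql://192.168.1.1:3306/oled", "root", "mysqladmin");
        // The values are stored as plain configuration entries (including the password).
        System.out.println(conf.get(DBConfiguration.DRIVER_CLASS_PROPERTY)); // com.mysql.jdbc.Driver
        System.out.println(conf.get(DBConfiguration.URL_PROPERTY));          // jdbc:mysql://192.168.1.1:3306/oled
    }
}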
Specify the record class, the data query, and the count query (DBInputFormat uses the count to compute input splits):
DBInputFormat.setInput(job, MyDBWritable.class, "select id,name from DB", "select count(1) from DB");

Add the multiple input sources:
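For reference, DBInputFormat.setInput also has a table-based overload that generates both queries itself; a sketch, under the assumption that the table really is named DB with columns id and name (taken from the query above):

// Equivalent table-based form: DBInputFormat builds the data query and the
// count query from the table name, WHERE conditions, ORDER BY column and fields.
DBInputFormat.setInput(job, MyDBWritable.class,
        "DB",          // table name
        null,          // WHERE conditions (none here)
        "id",          // ORDER BY column, used when splitting the input
        "id", "name"); // field names, in MyDBWritable.readFields order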
MultipleInputs.addInputPath(job, new Path("hdfs://192.168.1.100:9000/input/hehe"), TextInputFormat.class, TextMapper.class);
MultipleInputs.addInputPath(job, new Path("hdfs://192.168.1.100:9000/"), DBInputFormat.class, DBMapper.class);

Each call binds one InputFormat/Mapper pair to a path. For the DBInputFormat entry the path is only a placeholder, since the records come from the database rather than from HDFS.
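Since both sources are funneled into the same LongWritable/Text pairs, a downstream reducer cannot tell where a record came from. If that matters (for example, for a reduce-side join of file lines against database rows), a common pattern is to tag values per source in each mapper; a minimal sketch, with the FILE/DB tag strings being my own choice, where both classes would live inside MultipleInputsTest next to the original mappers:

// Variant of TextMapper that marks each record with its source.
public static class TaggedTextMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    private final Text v2 = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        v2.set("FILE\t" + value.toString()); // "FILE" tag: came from HDFS
        context.write(key, v2);
    }
}

// Variant of DBMapper with the matching "DB" tag.
public static class TaggedDBMapper extends Mapper<LongWritable, MyDBWritable, LongWritable, Text> {
    private final Text v2 = new Text();

    @Override
    protected void map(LongWritable key, MyDBWritable value, Context context)
            throws IOException, InterruptedException {
        v2.set("DB\t" + value.toString()); // "DB" tag: came from MySQL
        context.write(key, v2);
    }
}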