1. The MapReduce code is as follows:
package com.test.hadoop;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;
import org.apache.orc.mapreduce.OrcOutputFormat;

public class ORCSample {

    public static class ORCMapper extends Mapper<NullWritable, OrcStruct, Text, Text> {
        @Override
        public void map(NullWritable key, OrcStruct value, Context output)
                throws IOException, InterruptedException {
            // Input columns are (siteid, name, mobile); emit name -> mobile.
            output.write((Text) value.getFieldValue(1), (Text) value.getFieldValue(2));
        }
    }

    public static class ORCReducer extends Reducer<Text, Text, NullWritable, OrcStruct> {
        // Schema of the rows this reducer writes out.
        private TypeDescription schema = TypeDescription
                .fromString("struct<name:string,mobile:string>");
        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
        private final NullWritable nw = NullWritable.get();

        @Override
        public void reduce(Text key, Iterable<Text> values, Context output)
                throws IOException, InterruptedException {
            for (Text val : values) {
                pair.setFieldValue(0, key);
                pair.setFieldValue(1, val);
                output.write(nw, pair);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Schema that OrcOutputFormat uses when writing the output files.
        conf.set("orc.mapred.output.schema", "struct<name:string,mobile:string>");
        Job job = Job.getInstance(conf, "ORC Test");
        job.setJarByClass(ORCSample.class);
        job.setMapperClass(ORCMapper.class);
        job.setReducerClass(ORCReducer.class);
        job.setInputFormatClass(OrcInputFormat.class);
        job.setOutputFormatClass(OrcOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(OrcStruct.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
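As a side note, instead of hard-coding the "orc.mapred.output.schema" key, the schema can be set through the OrcConf constants shipped with the ORC libraries. A minimal sketch, assuming OrcConf.MAPRED_OUTPUT_SCHEMA is present in the orc-core version that orc-mapreduce 1.1.0 pulls in:

import org.apache.orc.OrcConf;

// Equivalent to conf.set("orc.mapred.output.schema", "struct<name:string,mobile:string>"):
OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, "struct<name:string,mobile:string>");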
2. Add the following dependencies to pom.xml (based on Hadoop 2.7.1):

<dependencies>
    <dependency>
        <groupId>org.apache.orc</groupId>
        <artifactId>orc-mapreduce</artifactId>
        <version>1.1.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>2.7.1</version>
    </dependency>
</dependencies>
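Note that step 4 runs a jar-with-dependencies, which the two dependencies above do not produce by themselves. A minimal sketch of a maven-assembly-plugin section that could build such a jar (this plugin configuration is an assumption about the project's build, not something from the original pom):

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>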
3. Create the tables, and insert three rows of data into t_test_orc (a sample INSERT is sketched after the first DDL below).
CREATE TABLE `t_test_orc` (
  `siteid` string,
  `name` string,
  `mobile` string)
STORED AS ORC;
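A minimal sketch of loading the sample rows (the values are made up for illustration; INSERT ... VALUES requires Hive 0.14 or later):

INSERT INTO TABLE t_test_orc VALUES
  ('1', 'alice', '13800000001'),
  ('2', 'bob', '13800000002'),
  ('3', 'carol', '13800000003');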
The second table points its LOCATION at the directory the job writes to in step 4, so Hive can read the job's output directly:

CREATE TABLE `t_test_orc_new` (
  `name` string,
  `mobile` string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION 'hdfs://namenode:9000/user/testorc3';
4. Package and run. If the job fails because the output directory already exists (Hive typically creates /user/testorc3 as soon as t_test_orc_new is defined), delete that directory first:
hadoop jar MRTest-1.0-jar-with-dependencies.jar com.test.hadoop.ORCSample /hive/warehouse/mytest.db/t_test_orc /user/testorc3
5. After the job finishes, you can inspect the results with hive --orcfiledump -d.
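For example (the part-file name below is an assumption; list the output directory first to find the actual file names):

hdfs dfs -ls /user/testorc3
hive --orcfiledump -d /user/testorc3/part-r-00000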
You can also go into Hive and query the ORC-format t_test_orc_new table, which is backed by the job's output directory, and the data shows up there as well.
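For example:

select * from t_test_orc_new;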
For more information, see https://orc.apache.org/.