To build a 300-column ORC table, first prepare test data: in Excel, create a simple sheet of 300 columns by 10,000 rows, save it as a tab-delimited txt file, and upload it to HDFS:
hdfs dfs -put ddd.txt hdfs://hadoop:9000/tmp/input/
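
If Excel is not handy, the same tab-delimited test file can also be generated locally with a small Java sketch before uploading it. The file name ddd.txt and the 300-column by 10,000-row shape follow the example above; the cell values themselves are just an arbitrary assumption.

import java.io.BufferedWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

public class GenTestData {
    public static void main(String[] args) throws IOException {
        // 10,000 rows x 300 tab-separated columns, matching the table created below.
        try (BufferedWriter w = Files.newBufferedWriter(Paths.get("ddd.txt"), StandardCharsets.UTF_8)) {
            for (int row = 1; row <= 10000; row++) {
                StringBuilder line = new StringBuilder();
                for (int col = 1; col <= 300; col++) {
                    if (col > 1) {
                        line.append('\t');
                    }
                    // Arbitrary cell value such as "r1c1", "r1c2", ...
                    line.append('r').append(row).append('c').append(col);
                }
                w.write(line.toString());
                w.newLine();
            }
        }
    }
}

Run it locally, then upload the resulting ddd.txt with the hdfs dfs -put command shown above.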
create table test_orc_300(
c1 string ,
c2 string ,
c3 string ,
c4 string ,
c5 string ,
c6 string ,
c7 string ,
c8 string ,
c9 string ,
c10 string ,
c11 string ,
c12 string ,
c13 string ,
c14 string ,
c15 string ,
c16 string ,
c17 string ,
c18 string ,
c19 string ,
c20 string ,
c21 string ,
c22 string ,
c23 string ,
c24 string ,
c25 string ,
c26 string ,
c27 string ,
c28 string ,
c29 string ,
c30 string ,
c31 string ,
c32 string ,
c33 string ,
c34 string ,
c35 string ,
c36 string ,
c37 string ,
c38 string ,
c39 string ,
c40 string ,
c41 string ,
c42 string ,
c43 string ,
c44 string ,
c45 string ,
c46 string ,
c47 string ,
c48 string ,
c49 string ,
c50 string ,
c51 string ,
c52 string ,
c53 string ,
c54 string ,
c55 string ,
c56 string ,
c57 string ,
c58 string ,
c59 string ,
c60 string ,
c61 string ,
c62 string ,
c63 string ,
c64 string ,
c65 string ,
c66 string ,
c67 string ,
c68 string ,
c69 string ,
c70 string ,
c71 string ,
c72 string ,
c73 string ,
c74 string ,
c75 string ,
c76 string ,
c77 string ,
c78 string ,
c79 string ,
c80 string ,
c81 string ,
c82 string ,
c83 string ,
c84 string ,
c85 string ,
c86 string ,
c87 string ,
c88 string ,
c89 string ,
c90 string ,
c91 string ,
c92 string ,
c93 string ,
c94 string ,
c95 string ,
c96 string ,
c97 string ,
c98 string ,
c99 string ,
c100 string ,
c101 string ,
c102 string ,
c103 string ,
c104 string ,
c105 string ,
c106 string ,
c107 string ,
c108 string ,
c109 string ,
c110 string ,
c111 string ,
c112 string ,
c113 string ,
c114 string ,
c115 string ,
c116 string ,
c117 string ,
c118 string ,
c119 string ,
c120 string ,
c121 string ,
c122 string ,
c123 string ,
c124 string ,
c125 string ,
c126 string ,
c127 string ,
c128 string ,
c129 string ,
c130 string ,
c131 string ,
c132 string ,
c133 string ,
c134 string ,
c135 string ,
c136 string ,
c137 string ,
c138 string ,
c139 string ,
c140 string ,
c141 string ,
c142 string ,
c143 string ,
c144 string ,
c145 string ,
c146 string ,
c147 string ,
c148 string ,
c149 string ,
c150 string ,
c151 string ,
c152 string ,
c153 string ,
c154 string ,
c155 string ,
c156 string ,
c157 string ,
c158 string ,
c159 string ,
c160 string ,
c161 string ,
c162 string ,
c163 string ,
c164 string ,
c165 string ,
c166 string ,
c167 string ,
c168 string ,
c169 string ,
c170 string ,
c171 string ,
c172 string ,
c173 string ,
c174 string ,
c175 string ,
c176 string ,
c177 string ,
c178 string ,
c179 string ,
c180 string ,
c181 string ,
c182 string ,
c183 string ,
c184 string ,
c185 string ,
c186 string ,
c187 string ,
c188 string ,
c189 string ,
c190 string ,
c191 string ,
c192 string ,
c193 string ,
c194 string ,
c195 string ,
c196 string ,
c197 string ,
c198 string ,
c199 string ,
c200 string ,
c201 string ,
c202 string ,
c203 string ,
c204 string ,
c205 string ,
c206 string ,
c207 string ,
c208 string ,
c209 string ,
c210 string ,
c211 string ,
c212 string ,
c213 string ,
c214 string ,
c215 string ,
c216 string ,
c217 string ,
c218 string ,
c219 string ,
c220 string ,
c221 string ,
c222 string ,
c223 string ,
c224 string ,
c225 string ,
c226 string ,
c227 string ,
c228 string ,
c229 string ,
c230 string ,
c231 string ,
c232 string ,
c233 string ,
c234 string ,
c235 string ,
c236 string ,
c237 string ,
c238 string ,
c239 string ,
c240 string ,
c241 string ,
c242 string ,
c243 string ,
c244 string ,
c245 string ,
c246 string ,
c247 string ,
c248 string ,
c249 string ,
c250 string ,
c251 string ,
c252 string ,
c253 string ,
c254 string ,
c255 string ,
c256 string ,
c257 string ,
c258 string ,
c259 string ,
c260 string ,
c261 string ,
c262 string ,
c263 string ,
c264 string ,
c265 string ,
c266 string ,
c267 string ,
c268 string ,
c269 string ,
c270 string ,
c271 string ,
c272 string ,
c273 string ,
c274 string ,
c275 string ,
c276 string ,
c277 string ,
c278 string ,
c279 string ,
c280 string ,
c281 string ,
c282 string ,
c283 string ,
c284 string ,
c285 string ,
c286 string ,
c287 string ,
c288 string ,
c289 string ,
c290 string ,
c291 string ,
c292 string ,
c293 string ,
c294 string ,
c295 string ,
c296 string ,
c297 string ,
c298 string ,
c299 string ,
c300 string
) stored as orc ;
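
Typing 300 column definitions by hand is tedious and error-prone; a throwaway sketch like the one below can print the same DDL for pasting into the Hive CLI (table and column names follow the statement above).

public class GenCreateTable {
    public static void main(String[] args) {
        // Print the CREATE TABLE statement for the 300-column ORC table shown above.
        StringBuilder sql = new StringBuilder("create table test_orc_300(\n");
        for (int i = 1; i <= 300; i++) {
            sql.append('c').append(i).append(" string");
            sql.append(i < 300 ? " ,\n" : "\n");
        }
        sql.append(") stored as orc ;");
        System.out.println(sql);
    }
}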
Reading ORC files with MapReduce
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;

import java.io.IOException;

public class OrcReaderMR {

    // Map-only job: every input record is one ORC row (OrcStruct); emit it as a tab-delimited text line.
    public static class OrcMap extends Mapper<NullWritable, OrcStruct, NullWritable, Text> {
        private Text text = new Text();

        @Override
        public void map(NullWritable key, OrcStruct value,
                        Context output) throws IOException, InterruptedException {
            StringBuilder bf = new StringBuilder();
            // Walk every field of the row and join the values with tabs.
            for (int i = 0; i < value.getNumFields(); i++) {
                WritableComparable fieldValue = value.getFieldValue(i);
                bf.append(fieldValue == null ? "" : fieldValue.toString()).append("\t");
            }
            text.set(bf.toString());
            output.write(NullWritable.get(), text);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        System.setProperty("HADOOP_USER_NAME", "root");
        // Development-environment setting for the local Hadoop installation
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");

        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcReaderMR.class);
        job.setJobName("OrcReaderMR");
        job.setMapperClass(OrcMap.class);
        job.setInputFormatClass(OrcInputFormat.class);
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Input path of the MapReduce job (the ORC files behind the Hive table)
        Path inputPath = new Path("/user/hive/warehouse/test_orc_300");
        // Output path of the MapReduce job
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300_out");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}
After the job finishes, check the output file contents:
hdfs dfs -cat hdfs://hadoop:9000/user/hive/warehouse/test_orc_300_out/part-m-00000
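
For a quick sanity check that does not require a MapReduce job, the ORC core API can open one of the table's files directly and print its schema and row count. This is only a minimal sketch: the file name part-m-00000.orc is an assumption, so list the table directory first (hdfs dfs -ls /user/hive/warehouse/test_orc_300) and substitute the real name.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;

public class OrcQuickCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        System.setProperty("HADOOP_USER_NAME", "root");
        // Assumed file name under the table directory; replace it with an actual file.
        Path path = new Path("/user/hive/warehouse/test_orc_300/part-m-00000.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        System.out.println("schema: " + reader.getSchema());
        System.out.println("rows:   " + reader.getNumberOfRows());
        // Read the first batch and print column c1 of the first row as a spot check.
        RecordReader rows = reader.rows();
        VectorizedRowBatch batch = reader.getSchema().createRowBatch();
        if (rows.nextBatch(batch)) {
            BytesColumnVector c1 = (BytesColumnVector) batch.cols[0];
            String first = new String(c1.vector[0], c1.start[0], c1.length[0], StandardCharsets.UTF_8);
            System.out.println("c1[0]:  " + first);
        }
        rows.close();
    }
}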
Writing ORC files with MapReduce
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.OrcConf;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class OrcWriterMR {

    // Build the schema string "struct<c1:string,c2:string,...,c300:string>" in a loop
    // so the 300 columns do not have to be written out by hand.
    static String buildSchemaString(int numCols) {
        StringBuilder sb = new StringBuilder("struct<");
        for (int i = 1; i <= numCols; i++) {
            if (i > 1) {
                sb.append(',');
            }
            sb.append('c').append(i).append(":string");
        }
        return sb.append('>').toString();
    }

    // Map-only job: every tab-delimited input line becomes one 300-field OrcStruct row.
    public static class OrcWriterMapper
            extends Mapper<LongWritable, Text, NullWritable, OrcStruct> {
        private TypeDescription schema =
                TypeDescription.fromString(buildSchemaString(300));
        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
        private final NullWritable nada = NullWritable.get();

        @Override
        public void map(LongWritable key, Text value,
                        Context output) throws IOException, InterruptedException {
            List<Text> list = new ArrayList<>();
            for (int i = 0; i < 300; i++) {
                list.add(new Text());
            }
            if (!"".equals(value.toString())) {
                // The -1 limit keeps trailing empty columns instead of dropping them.
                String[] arr = value.toString().split("\t", -1);
                for (int n = 0; n < 300; n++) {
                    list.get(n).set(arr[n]);
                    pair.setFieldValue(n, list.get(n));
                }
                output.write(nada, pair);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // HDFS-related parameters for the MapReduce job
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        conf.set("mapreduce.application.classpath", System.getProperty("user.dir"));
        // Tell OrcOutputFormat which schema to write
        OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, buildSchemaString(300));
        // Settings for a distributed cluster
        // conf.set("mapred.jar", System.getProperty("user.dir") + "/WordCount.jar");
        System.setProperty("HADOOP_USER_NAME", "root");
        // Development-environment setting for the local Hadoop installation
        System.setProperty("hadoop.home.dir", "/opt/hadoop-2.7.3/");
        // MapReduce execution mode; these are the default values
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.resourcemanager.hostname", "hadoop");

        // Get the Job object
        Job job = Job.getInstance(conf);
        // Set the jar containing the job classes
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("OrcWriterMR");
        job.setNumReduceTasks(0);
        // Mapper, input format and output format (map-only job, no reducer)
        job.setMapperClass(OrcWriterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(OrcOutputFormat.class);

        // Input path of the MapReduce job (the tab-delimited text file)
        Path inputPath = new Path("/tmp/input/300.txt");
        // Output path of the MapReduce job (the Hive table directory)
        Path outputPath = new Path("/user/hive/warehouse/test_orc_300");
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean waitForCompletion = job.waitForCompletion(true);
        System.exit(waitForCompletion ? 0 : 1);
    }
}
After the job finishes, check the table contents in Hive:
select * from test_orc_300;
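
As an alternative to the MapReduce writer above, the ORC core Writer API can also produce such a file directly. The sketch below is only illustrative: the target file name and the generated demo values are assumptions, and the schema is built in a loop to match the 300-column table.

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class OrcDirectWriter {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop:9000");
        System.setProperty("HADOOP_USER_NAME", "root");
        // Same 300-column schema as the Hive table, built in a loop.
        TypeDescription schema = TypeDescription.createStruct();
        for (int i = 1; i <= 300; i++) {
            schema.addField("c" + i, TypeDescription.createString());
        }
        // Assumed target file name under the table directory.
        Path path = new Path("/user/hive/warehouse/test_orc_300/direct-0000.orc");
        Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
        VectorizedRowBatch batch = schema.createRowBatch();
        for (int row = 0; row < 10; row++) {          // a handful of demo rows
            int r = batch.size++;
            for (int col = 0; col < 300; col++) {
                byte[] v = ("r" + row + "c" + (col + 1)).getBytes(StandardCharsets.UTF_8);
                ((BytesColumnVector) batch.cols[col]).setRef(r, v, 0, v.length);
            }
            // Flush the batch whenever it fills up.
            if (batch.size == batch.getMaxSize()) {
                writer.addRowBatch(batch);
                batch.reset();
            }
        }
        if (batch.size > 0) {
            writer.addRowBatch(batch);
        }
        writer.close();
        // The new rows should then be visible to select * from test_orc_300.
    }
}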