The code below is entirely my own work; it only implements the functionality, and performance is still lacking for large-scale data. Straight to the code:
public ImportDataResponse importData(String connectionID, String dataBase, String tableName, String sourceFileType, String sourceFilePath) throws Exception {
ImportDataResponse importDataResponse = new ImportDataResponse();
orc orc = new orc();
File file = new File(sourceFilePath);
BufferedReader reader = null;
String temp = null;
String rowkey = null;
String tmpvalue = null;
try {
if (ConnectionImpl.mapConnPool.get(connectionID) == null) {
importDataResponse.setTaskStatus(0);
importDataResponse.setErrorMsg("Please establish a connection first");
} else {
Configuration conf = (Configuration) ConnectionImpl.mapConnPool.get(connectionID);
Connection conn = ConnectionFactory.createConnection(conf);
Table table = conn.getTable(TableName.valueOf("default", tableName));
List<Put> listPut = new ArrayList<>();
if (sourceFileType.equals("TXT") || sourceFileType.equals("CSV")) {
reader = new BufferedReader(new FileReader(file));
while ((temp = reader.readLine()) != null) {
rowkey = temp.split(",")[0];
Put put = new Put(rowkey.getBytes());
tmpvalue = temp.substring(rowkey.length() + 1);
put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
listPut.add(put);
}
} else if (sourceFileType.equals("ORC")) {
String reader1 = orc.readerOrc(sourceFilePath);
String[] split = reader1.substring(0, reader1.length() - 1).split("\t");
for (int i = 0; i < split.length; i++) {
rowkey = split[i].split("&")[0];
Put put = new Put(rowkey.getBytes());
tmpvalue = split[i].split("&")[1];
put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
listPut.add(put);
}
} else if (sourceFileType.equals("Parquet")) {
Path file1 = new Path(sourceFilePath);
ParquetReader.Builder<Group> builder = ParquetReader.builder(new GroupReadSupport(), file1);
ParquetReader<Group> reader1 = builder.build();
Group line = null;
while ((line = reader1.read()) != null) {
rowkey = line.getString("rowkey", 0);
// line.getString("family",0);
// line.getString("field",0);
tmpvalue = line.getString("value", 0);
Put put = new Put(rowkey.getBytes());
put.addColumn("family".getBytes(), ("field").getBytes(), tmpvalue.getBytes());
listPut.add(put);
System.out.println("success!");
}
reader1.close();
} else {
System.out.println("Please specify a valid file format!");
}
table.put(listPut);
table.close();
conn.close();
if (reader != null) {
reader.close();
}
importDataResponse.setTaskStatus(1);
importDataResponse.setTable(tableName);
}
} catch (Exception e) {
e.printStackTrace();
importDataResponse.setTaskStatus(0);
importDataResponse.setErrorCode(11);
importDataResponse.setErrorMsg(e.getMessage());
}
return importDataResponse;
}
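As the intro notes, this approach falls short for large-scale data: every Put is buffered into a single in-memory list and written with one table.put() call. Below is a minimal sketch of a streaming alternative built on HBase's BufferedMutator; the class name, table/family/qualifier names, and the "rowkey,value" CSV layout are assumptions carried over from the code above, not a definitive implementation.

import java.io.BufferedReader;
import java.io.FileReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.BufferedMutator;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class BufferedCsvImport {
    // Sketch only: stream Puts through a BufferedMutator instead of one huge List.
    public static void importCsv(Configuration conf, String tableName, String csvPath) throws Exception {
        try (Connection conn = ConnectionFactory.createConnection(conf);
             BufferedMutator mutator = conn.getBufferedMutator(TableName.valueOf(tableName));
             BufferedReader reader = new BufferedReader(new FileReader(csvPath))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String rowkey = line.split(",")[0];
                String value = line.substring(rowkey.length() + 1);
                Put put = new Put(Bytes.toBytes(rowkey));
                // "family"/"field" are placeholders, matching the hard-coded names above
                put.addColumn(Bytes.toBytes("family"), Bytes.toBytes("field"), Bytes.toBytes(value));
                mutator.mutate(put); // buffered client-side and flushed in batches
            }
            mutator.flush(); // push any remaining mutations before closing
        }
    }
}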
The orc class:
package com.Distributedcolumndatabase.Management;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.List;
import java.util.Properties;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
public class orc {
String path;
/**
* Read (import) an ORC file, returning its rows as tab-separated "rowkey&value" pairs
*
* @param path
* @throws IOException
* @throws SerDeException
*/
public String readerOrc(String path) throws IOException, SerDeException {
String str = ""; // start with an empty string so the concatenation below does not prepend "null"
JobConf conf = new JobConf();
Path testFilePath = new Path(path);
Properties p = new Properties();
OrcSerde serde = new OrcSerde();
p.setProperty("columns", "rowkey,family,field,value");
p.setProperty("columns.types", "string:string:string:string");
serde.initialize(conf, p);
StructObjectInspector inspector = (StructObjectInspector) serde.getObjectInspector();
InputFormat in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
System.out.println("splits.length==" + splits.length);
conf.set("hive.io.file.readcolumn.ids", "1");
org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
Object key = reader.createKey();
Object value = reader.createValue();
List<? extends StructField> fields = inspector.getAllStructFieldRefs();
long offset = reader.getPos();
while (reader.next(key, value)) {
Object rowkey = inspector.getStructFieldData(value, fields.get(0));
Object field_value = inspector.getStructFieldData(value, fields.get(3));
offset = reader.getPos();
str+=rowkey+"&"+field_value+"\t";
}
reader.close();
return str;
}
/**
* Write (export) a single row to an ORC file
*
* @param path
* @param myrow
* @throws IOException
*/
public void writerOrc(Path path, MyRow myrow) throws IOException {
JobConf conf = new JobConf();
FileSystem fs = FileSystem.get(conf);
// Path outputPath = new Path(path);
StructObjectInspector inspector =
(StructObjectInspector) ObjectInspectorFactory
.getReflectionObjectInspector(MyRow.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
OrcSerde serde = new OrcSerde();
OutputFormat outFormat = new OrcOutputFormat();
RecordWriter writer = outFormat.getRecordWriter(fs, conf,
path.toString(), Reporter.NULL);
writer.write(NullWritable.get(),
serde.serialize(myrow, inspector));
writer.close(Reporter.NULL);
fs.close();
System.out.println("write success .");
}
public static class MyRow implements Writable {
String rowkey;
String family;
String field;
String value;
public MyRow(String rowkey, String family, String field,String value) {
this.rowkey = rowkey;
this.family = family;
this.field = field;
this.value = value;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
public String getRowkey() {
return rowkey;
}
public void setRowkey(String rowkey) {
this.rowkey = rowkey;
}
public String getFamily() {
return family;
}
public void setFamily(String family) {
this.family = family;
}
public String getField() {
return field;
}
public void setField(String field) {
this.field = field;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
throw new UnsupportedOperationException("write not supported");
}
@Override
public void readFields(DataInput dataInput) throws IOException {
throw new UnsupportedOperationException("readFields not supported");
}
}
}
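For context, here is a minimal usage sketch of the orc helper above. The output path and row values are placeholders, and the demo class is assumed to live in the same package so it can reach the nested MyRow class.

package com.Distributedcolumndatabase.Management;
import org.apache.hadoop.fs.Path;

public class OrcHelperDemo {
    public static void main(String[] args) throws Exception {
        orc helper = new orc();
        Path out = new Path("/tmp/demo.orc"); // placeholder path
        // write one row, then read the file back
        helper.writerOrc(out, new orc.MyRow("row1", "family", "field", "value1"));
        // readerOrc concatenates rows as "rowkey&value" pairs separated by tabs
        String dump = helper.readerOrc("/tmp/demo.orc");
        System.out.println(dump); // expected to print "row1&value1" followed by a trailing tab
    }
}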
public ExportDataResponse exportData(String connectionID, String dataBase, String tableName, String destFileType, String destFilePath) {
ExportDataResponse exportDataResponse = new ExportDataResponse();
orc orc = new orc();
if (ConnectionImpl.mapConnPool.get(connectionID) == null) {
exportDataResponse.setTaskStatus(0);
exportDataResponse.setErrorMsg("Please establish a connection first");
} else {
Configuration conf = (Configuration) ConnectionImpl.mapConnPool.get(connectionID);
Connection conn = null;
try {
conn = ConnectionFactory.createConnection(conf);
Table table = conn.getTable(TableName.valueOf("default", tableName));
//Collection families = table.getTableDescriptor().getFamilies();
// Keep the table open until the scanner has been fully consumed;
// the scanner, table, and connection are closed after the export below.
Scan scan = new Scan();
ResultScanner resultScanner = table.getScanner(scan);
if (destFileType.equals("TXT") || destFileType.equals("CSV")) {
File file = new File(destFilePath);
FileWriter writer = new FileWriter(file);
for (Result result : resultScanner) {
List<Cell> keyValueList = result.listCells();
for (Cell cell : keyValueList) {
System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " columnfamily=" + new String(CellUtil.cloneFamily(cell)) + " column=" + new String(CellUtil.cloneQualifier(cell)) + " value=" + new String(CellUtil.cloneValue(cell)));
String str = new String(CellUtil.cloneRow(cell)) + "," + new String(CellUtil.cloneFamily(cell)) + "," + new String(CellUtil.cloneQualifier(cell)) + "," + new String(CellUtil.cloneValue(cell)) + "\n";
writer.write(str);
}
}
writer.flush();
writer.close();
} else if (destFileType.equals("ORC")) {
//Path outputPath = new Path(destFilePath);
JobConf conf1 = new JobConf();
FileSystem fs = FileSystem.get(conf1);
Path outputPath = new Path(destFilePath);
StructObjectInspector inspector =
(StructObjectInspector) ObjectInspectorFactory
.getReflectionObjectInspector(orc.MyRow.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
OrcSerde serde = new OrcSerde();
OutputFormat outFormat = new OrcOutputFormat();
RecordWriter writer = outFormat.getRecordWriter(fs, conf1,
outputPath.toString(), Reporter.NULL);
for (Result result : resultScanner) {
List<Cell> keyValueList = result.listCells();
for (Cell cell : keyValueList) {
// System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " " + "columefamily=" + new String(CellUtil.cloneFamily(cell)) + " " + "colume = " + new String(CellUtil.cloneFamily(cell) + " " + "value=" + new String(CellUtil.cloneValue(cell)));
String str = new String(CellUtil.cloneRow(cell)) + "," + new String(CellUtil.cloneFamily(cell)) + "," + new String(CellUtil.cloneQualifier(cell)) + "," + new String(CellUtil.cloneValue(cell)) + "\n";
//orc.writerOrc(outputPath, new orc.MyRow(new String(CellUtil.cloneRow(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneValue(cell))));
writer.write(NullWritable.get(),
serde.serialize(new orc.MyRow(new String(CellUtil.cloneRow(cell)), new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneQualifier(cell)), new String(CellUtil.cloneValue(cell))), inspector));
}
}
writer.close(Reporter.NULL);
fs.close();
System.out.println("write success .");
}else if(destFileType.equals("Parquet")){
// build the Parquet schema
String schemaStr = "message schema {" + "repeated binary rowkey;"
+"repeated binary family;"+ "repeated binary colume;"+"repeated binary value;}";
MessageType schema = MessageTypeParser.parseMessageType(schemaStr);
Path file = new Path(destFilePath);
ExampleParquetWriter.Builder builder = ExampleParquetWriter
.builder(file).withWriteMode(ParquetFileWriter.Mode.CREATE)
.withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
.withCompressionCodec(CompressionCodecName.SNAPPY)
//.withConf(configuration)
.withType(schema);
ParquetWriter<Group> writer = builder.build();
SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
for (Result result : resultScanner) {
List<Cell> keyValueList = result.listCells();
for (Cell cell : keyValueList) {
System.out.println("row=" + new String(CellUtil.cloneRow(cell)) + " columnfamily=" + new String(CellUtil.cloneFamily(cell)) + " column=" + new String(CellUtil.cloneQualifier(cell)) + " value=" + new String(CellUtil.cloneValue(cell)));
String[] str = {new String(CellUtil.cloneRow(cell)) , new String(CellUtil.cloneFamily(cell)), new String(CellUtil.cloneQualifier(cell)),new String(CellUtil.cloneValue(cell)) };
writer.write(groupFactory.newGroup()
.append("rowkey",str[0])
.append("family",str[1])
.append("colume",str[2])
.append("value",str[3]));
System.out.println("=======================");
}
}
writer.close();
} else {
System.out.println("Please specify a valid file format!");
}
resultScanner.close();
table.close();
conn.close();
exportDataResponse.setTaskStatus(1);
exportDataResponse.setTable(tableName);
exportDataResponse.setErrorCode(0);
} catch (IOException e) {
exportDataResponse.setTaskStatus(0);
exportDataResponse.setTable(tableName);
exportDataResponse.setErrorMsg(e.getMessage());
exportDataResponse.setErrorCode(11);
}
}
return exportDataResponse;
}
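Finally, a hypothetical call site for the two methods above. The service variable, connection id, file paths, and the getTaskStatus() getter are assumptions; they depend on how ImportDataResponse/ExportDataResponse and the surrounding service class are actually defined, and connId must already be registered in ConnectionImpl.mapConnPool.

// Hypothetical caller; "service" is whatever object exposes importData/exportData.
ImportDataResponse importResp = service.importData(connId, "default", "mytable", "CSV", "/data/in.csv");
if (importResp.getTaskStatus() == 1) { // assumed getter for the taskStatus field set above
    ExportDataResponse exportResp = service.exportData(connId, "default", "mytable", "Parquet", "/data/out.parquet");
    System.out.println("export status: " + exportResp.getTaskStatus());
}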