1. Requirement: collect data from Kafka and write it to HDFS as ORC files. The Hive table on HDFS is stored as ORC, and once the written directories are bound as partitions the data can be queried (a partition-binding sketch follows at the end, after the code).
2. Attempts to meet the requirement
1) Flume can collect from Kafka and write to HDFS, but it cannot write ORC.
2) Logstash can write to HDFS, but it cannot write ORC either.
3) DataX: I have not used it, so I do not know whether it can write ORC.
3. So I implemented it myself: a Kafka consumer that calls Hive's ORC API (OrcSerde + OrcOutputFormat) to write ORC files.
package rongan.kafka;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import rongan.commos.PropertiesUtil;
import rongan.constants.Constans;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.text.SimpleDateFormat;
import java.util.*;
public class KafkaConsumer {
private static Properties properties = PropertiesUtil.getProperties("commerce.properties");
private static JobConf configuration = new JobConf();
private static FileSystem fs = null;
private static FSDataOutputStream outputStream = null;
private static Path writePath = null;
private static String hdfsBasicPath = properties.getProperty(Constans.HDFS_PATH);
private static OrcSerde serde = new OrcSerde();
private static OutputFormat outputFormat = new OrcOutputFormat();
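// Reflect over RsdTornadoEvent's fields to build a StructObjectInspector;
// OrcSerde uses it to map each POJO instance onto the columns of the ORC struct.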
private static StructObjectInspector inspector =
(StructObjectInspector) ObjectInspectorFactory
.getReflectionObjectInspector(RsdTornadoEvent.class,
ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
public static void main(String[] args) throws IOException {
//1. Create the Kafka consumer
org.apache.kafka.clients.consumer.KafkaConsumer consumer = getConsumer();
//2. Get the HDFS FileSystem
fs = getFileSystem();
//3. Record the current time (used to decide when to roll files)
Long lastTime = System.currentTimeMillis();
//4. Build the full write directory (hdfsBasicPath/yyyy/MM/dd)
String totalPath = getTotalPath(lastTime);
System.out.println(totalPath);
//5. Create a Path object for that directory
writePath = new Path(totalPath);
RecordWriter write = null;
//6. Open the ORC record writer
write = getWriter(writePath);
//7. Start pulling data
startCollect(consumer, lastTime, write);
}
/**
 * Start pulling records from Kafka and writing them as ORC
 * @param consumer
 * @param lastTime
 * @param write
 * @throws IOException
 */
private static void startCollect(org.apache.kafka.clients.consumer.KafkaConsumer consumer, Long lastTime, RecordWriter write) throws IOException {
while (true) {
ConsumerRecords records = consumer.poll(2000);
System.out.println("开始拉取数据 " + new Date());
int count = 0;
for (ConsumerRecord record : records) {
count++;
// Roll to a new file/directory every 12 minutes (720000 ms)
if (System.currentTimeMillis() - lastTime > 720000) {
write.close(Reporter.NULL);
System.out.println("Rolling to a new file " + new Date().toString());
// Current time
Long currentTime = System.currentTimeMillis();
// Rebuild the write path for the new time
String newPath = getTotalPath(currentTime);
writePath = new Path(newPath);
write = getWriter(writePath);
lastTime = currentTime;
}
String[] values = record.value().split("\t");
System.out.println(record.value());
// A record has 34 tab-separated fields (values[0]..values[33]); skip malformed lines
if (values.length < 34) {
continue;
}
write.write(NullWritable.get(), serde.serialize(new RsdTornadoEvent(values[0], values[1], values[2], values[3], values[4], values[5], values[6],
values[7], values[8], values[9], values[10], values[11], values[12], values[13], values[14], values[15], values[16], values[17], values[18], values[19], values[20], values[21],
values[22], values[23], values[24], values[25], values[26], values[27], values[28], values[29], values[30], values[31], values[32], values[33]), inspector));
}
System.out.println("本次拉取完毕 " + new Date() + "拉取" + count + "条");
}
}
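/**
 * Open an ORC RecordWriter on a new file named by the current epoch millis
 * under the given directory, creating the directory first if it does not exist.
 */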
private static RecordWriter getWriter(Path writePath) {
try {
if (fs.exists(writePath)) {
System.out.println(writePath.toString() + " already exists");
return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
} else {
System.out.println(writePath.toString() + " does not exist, creating it");
fs.mkdirs(writePath);
return outputFormat.getRecordWriter(fs, configuration, writePath.toString() + "/" + System.currentTimeMillis(), Reporter.NULL);
}
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
/**
 * Get the HDFS file system
 * @return
 */
private static FileSystem getFileSystem() {
try {
// Connect to the HDFS file system
fs = FileSystem.get(new URI(properties.getProperty(Constans.HDFS_ADDRESS)), configuration);
return fs;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
/**
 * Create and subscribe the Kafka consumer
 * @return
 */
private static org.apache.kafka.clients.consumer.KafkaConsumer getConsumer() {
// Consumer configuration
Properties properties1 = new Properties();
properties1.put("bootstrap.servers", properties.getProperty(Constans.KAFKA_BROKER_LIST));
properties1.put("group.id", "getEsEvent");
// The zookeeper.* settings below are legacy high-level-consumer options; the new
// KafkaConsumer ignores them and talks to the brokers directly.
properties1.put("zookeeper.session.timeout.ms", "1000");
properties1.put("zookeeper.sync.time.ms", "250");
properties1.put("auto.commit.interval.ms", "1000");
properties1.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
properties1.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
// With the old high-level consumer, the client connected to the ZK cluster through a
// ConsumerConnector, got its assigned partitions, created a MessageStream per partition
// and iterated the streams to read each message:
// ConsumerConnector consumer = Consumer.createJavaConsumerConnector(new ConsumerConfig(properties));
org.apache.kafka.clients.consumer.KafkaConsumer consumer = new org.apache.kafka.clients.consumer.KafkaConsumer<>(properties1);
consumer.subscribe(Arrays.asList("t_rsd_tornado_event"));
return consumer;
}
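// Unused stub: never called and its body is empty.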
private static void save(String log) {
try {
} catch (Exception e) {
e.printStackTrace();
}
}
private static String timeTransform(Long timeInMills) {
Date time = new Date(timeInMills);
String formatDate = "";
try {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd-HHmm");
formatDate = sdf.format(time);
} catch (Exception e) {
e.printStackTrace();
}
return formatDate;
}
/**
 * Extract the directory part (yyyy/MM/dd) from a formatted date.
 * @param date
 * @return
 */
private static String getDirectoryFromDate(String date) {
// yyyy-MM-dd-HHmm
// date.split("-") ["yyyy","MM", "dd", "HHmm"]
String[] directories = date.split("-");
// yyyy/MM/dd
String directory = directories[0] + "/" + directories[1] + "/" + directories[2];
return directory;
}
/**
 * Extract the file-name part (HHmm) from a formatted date. Currently unused:
 * the actual file name is the epoch millis appended in getWriter.
 * @param date
 * @return
 */
private static String getFileName(String date) {
// formatDate is yyyy-MM-dd-HHmm, so HHmm is the fourth part
String[] dateSplit = date.split("-");
String fileName = dateSplit[3];
return fileName;
}
/**
 * Build the write directory for the given timestamp: hdfsBasicPath + yyyy/MM/dd
 * @param lastTime
 * @return
 */
private static String getTotalPath(Long lastTime) {
// Format the timestamp (yyyy-MM-dd-HHmm)
String formatDate = timeTransform(lastTime);
// Extract the directory part (yyyy/MM/dd)
String directory = getDirectoryFromDate(formatDate);
// Extract the file-name part (HHmm); not used, the file name is set in getWriter
String fileName = getFileName(formatDate);
// Full directory path (hdfsBasicPath + yyyy/MM/dd)
String totalPath = hdfsBasicPath + directory;
return totalPath;
}
/**
 * POJO mirroring the Hive table schema
 */
static class RsdTornadoEvent implements Writable {
String id;
String device_Id;
String src_Obj;
String dest_Obj;
String src_Ip;
String dest_Ip;
String src_Mac;
String dest_Mac;
String protocol;
String app_Layer_Protocol;
String src_Domain;
String dest_Domain;
String ip_Version;
String src_Port;
String dest_Port;
String packet_Size;
String package_Data;
String payload;
String sig_Id;
String signame;
String match_Point;
String match_Data;
String action;
String incident_Level;
String incident_Time;
String risk_level;
String incident_Type;
String active;
String lastUpdate_Time;
String lastUpdate_User;
String create_Time;
String creator;
String data_From;
String send_Time;
public RsdTornadoEvent() {
}
public RsdTornadoEvent(String id, String device_Id, String src_Obj, String dest_Obj, String src_Ip, String dest_Ip, String src_Mac, String dest_Mac, String protocol, String app_Layer_Protocol, String src_Domain, String dest_Domain, String ip_Version, String src_Port, String dest_Port, String packet_Size, String package_Data, String payload, String sig_Id, String signame, String match_Point, String match_Data, String action, String incident_Level, String incident_Time, String risk_level, String incident_Type, String active, String lastUpdate_Time, String lastUpdate_User, String create_Time, String creator, String data_From, String send_Time) {
this.id = id;
this.device_Id = device_Id;
this.src_Obj = src_Obj;
this.dest_Obj = dest_Obj;
this.src_Ip = src_Ip;
this.dest_Ip = dest_Ip;
this.src_Mac = src_Mac;
this.dest_Mac = dest_Mac;
this.protocol = protocol;
this.app_Layer_Protocol = app_Layer_Protocol;
this.src_Domain = src_Domain;
this.dest_Domain = dest_Domain;
this.ip_Version = ip_Version;
this.src_Port = src_Port;
this.dest_Port = dest_Port;
this.packet_Size = packet_Size;
this.package_Data = package_Data;
this.payload = payload;
this.sig_Id = sig_Id;
this.signame = signame;
this.match_Point = match_Point;
this.match_Data = match_Data;
this.action = action;
this.incident_Level = incident_Level;
this.incident_Time = incident_Time;
this.risk_level = risk_level;
this.incident_Type = incident_Type;
this.active = active;
this.lastUpdate_Time = lastUpdate_Time;
this.lastUpdate_User = lastUpdate_User;
this.create_Time = create_Time;
this.creator = creator;
this.data_From = data_From;
this.send_Time = send_Time;
}
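// The Writable methods are never called: OrcSerde serializes the object through the
// StructObjectInspector above, so both methods deliberately just throw.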
@Override
public void write(DataOutput dataOutput) throws IOException {
throw new UnsupportedOperationException("no write");
}
@Override
public void readFields(DataInput dataInput) throws IOException {
throw new UnsupportedOperationException("no read");
}
}
}
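For step 1 of the requirement (the ORC table and partition binding), the Hive side can be scripted as well. Below is a minimal sketch over Hive JDBC: the table name follows the Kafka topic used above, while the HiveServer2 URL, credentials, HDFS location, the dt partition column and the example date are assumptions, and the full 34-column list (matching RsdTornadoEvent) is elided.
package rongan.hive;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;
/**
 * Sketch only: create the ORC table and bind one day's directory as a partition.
 * URL, credentials, location and partition value are placeholders.
 */
public class BindOrcPartition {
    public static void main(String[] args) throws Exception {
        Class.forName("org.apache.hive.jdbc.HiveDriver");
        try (Connection conn = DriverManager.getConnection(
                "jdbc:hive2://hiveserver2-host:10000/default", "hive", "");
             Statement stmt = conn.createStatement()) {
            // External ORC table partitioned by day; only the first few columns are
            // shown here, the real DDL lists all 34 columns of RsdTornadoEvent in order.
            stmt.execute("CREATE EXTERNAL TABLE IF NOT EXISTS t_rsd_tornado_event ("
                    + "id STRING, device_id STRING, src_obj STRING, dest_obj STRING"
                    + ") PARTITIONED BY (dt STRING) STORED AS ORC "
                    + "LOCATION '/data/t_rsd_tornado_event'");
            // Bind a directory written by the consumer (hdfsBasicPath/yyyy/MM/dd) as a partition.
            stmt.execute("ALTER TABLE t_rsd_tornado_event ADD IF NOT EXISTS "
                    + "PARTITION (dt='2019-07-01') "
                    + "LOCATION '/data/t_rsd_tornado_event/2019/07/01'");
        }
    }
}
Because the consumer writes plain yyyy/MM/dd directories rather than Hive's dt=.../ layout, MSCK REPAIR TABLE will not discover them automatically, which is why each directory is bound explicitly with ADD PARTITION ... LOCATION.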