package com.weshare.bigdata.ods.handler;

import com.alibaba.fastjson.JSONObject;
import com.weshare.bigdata.entity.ClusterEnvirEntity;
import com.weshare.bigdata.facility.ClusterEnvirFacility;
import com.weshare.bigdata.ods.constant.DetailConstant;
import com.weshare.bigdata.ods.utils.DateUtils;
import com.weshare.dataframework.spark.DfSparkSession;
import com.weshare.dataframework.spark.entity.SparkApplication;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleTextOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;

import java.io.IOException;
import java.util.*;

public class KafkaDataClassifyBySelf {

    private static Logger logger = LoggerFactory.getLogger(KafkaDataClassifyBySelf.class);

    static JavaInputDStream<ConsumerRecord<String, String>> directStream;
    static String hdfspath = "/user/admin/FA_OFFLINE/";
    static String classifyTbl;
    static String kuduMaster;
    static String bootstrap;

    public static void main(String[] args) {
        //System.setProperty("hadoop.home.dir","etc/dtconf/bin");
        SparkApplication application = new SparkApplication();
        application.setAppName("DemoStreaming");
        application.setSerializer("org.apache.spark.serializer.KryoSerializer");
        SparkSession sparkSession = DfSparkSession.bulid(application);
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkSession.sparkContext());
        JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.seconds(600));

        // Set the Kudu master and the time-slice configuration table
        ClusterEnvirFacility clusterEnvirFacility = new ClusterEnvirFacility();
        ClusterEnvirEntity environmentInfo = clusterEnvirFacility.getEnvironmentInfo(sparkSession);
        kuduMaster = environmentInfo.getKuduMaster();
        classifyTbl = "impala::config.bus_calibration_time";

        // Get the Kafka broker information
        bootstrap = environmentInfo.getBootstrap();
        HashMap<String, Object> kafkaMap = new HashMap<>();
        // Kafka broker listen address (host:port)
        kafkaMap.put("bootstrap.servers", bootstrap);
        kafkaMap.put("key.deserializer", StringDeserializer.class);
        kafkaMap.put("value.deserializer", StringDeserializer.class);
        // Consumer group id; any name will do
        kafkaMap.put("group.id", "KafkaDataClassifyData");
        // Start consuming from latest (newest; some client versions call this "largest", which is not accepted here)
        // or earliest (oldest) when no committed offset exists
        kafkaMap.put("auto.offset.reset", "earliest");
        // If true, the consumer periodically writes each partition's offset to ZooKeeper
        kafkaMap.put("enable.auto.commit", "false");

        Map regularTime = KafkaDataClassifyBySelf.getRegularTime(sparkSession, kuduMaster, classifyTbl);
        ClassTag
// Override RDDMultipleTextOutputFormat and AppendTextOutputFormat so that output
// directories are created per time bucket and data is appended to existing files incrementally.
public class RDDMultipleTextOutputFormat extends MultipleTextOutputFormat<String, String> {

    private AppendTextOutputFormat<String, String> theTextOutputFormat = null;

    public String generateFileNameForKeyValue(String key, String value, String name) {
        // Output layout: <output root>/<key>/<task file name>
        return key + "/" + name;
    }

    @Override
    protected RecordWriter<String, String> getBaseRecordWriter(FileSystem fs, JobConf job, String name,
                                                               Progressable progressable) throws IOException {
        if (this.theTextOutputFormat == null) {
            this.theTextOutputFormat = new AppendTextOutputFormat<>();
        }
        return this.theTextOutputFormat.getRecordWriter(fs, job, name, progressable);
    }
}
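With this class in place, a pair RDD whose keys are time-bucket directory names can be handed straight to saveAsHadoopFile. The sketch below is illustrative only and is not code from the job above: bucketOf is a hypothetical helper, and the output root is an example value. If the root directory already exists from an earlier batch, setting spark.hadoop.validateOutputSpecs=false keeps Spark's output check from rejecting it. The append-capable writer that this format delegates to follows.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

// Usage sketch only (not part of the job above): write an RDD of JSON strings,
// keyed by a time-bucket directory name, through RDDMultipleTextOutputFormat.
public class ClassifiedWriteSketch {

    public static void write(JavaRDD<String> jsonRdd, String outputRoot) {
        JavaPairRDD<String, String> classified = jsonRdd.mapToPair(
                json -> new Tuple2<>(bucketOf(json), json));
        // Records land in <outputRoot>/<bucket>/part-xxxxx; on a later run,
        // AppendTextOutputFormat appends to those files instead of failing.
        classified.saveAsHadoopFile(
                outputRoot,
                String.class, String.class,
                RDDMultipleTextOutputFormat.class);
    }

    // Hypothetical helper: derive the time-bucket directory for one record.
    private static String bucketOf(String json) {
        return "20240101";
    }
}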
import java.io.DataOutputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

public class AppendTextOutputFormat<K, V> extends TextOutputFormat<K, V> {

    protected static class MyLineRecordWriter<K, V> implements RecordWriter<K, V> {

        private static final byte[] NEWLINE;
        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public MyLineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            this.keyValueSeparator = keyValueSeparator.getBytes(StandardCharsets.UTF_8);
        }

        public MyLineRecordWriter(DataOutputStream out) {
            this(out, "\t");
        }

        private void writeObject(Object o) throws IOException {
            if (o instanceof Text) {
                Text to = (Text) o;
                this.out.write(to.getBytes(), 0, to.getLength());
            } else {
                this.out.write(o.toString().getBytes(StandardCharsets.UTF_8));
            }
        }

        public synchronized void write(K key, V value) throws IOException {
            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (!nullKey || !nullValue) {
                /*if (!nullKey) {
                    this.writeObject(key);
                }
                if (!nullKey && !nullValue) {
                    this.out.write(this.keyValueSeparator);
                }*/
                if (!nullValue) {
                    this.writeObject(value);
                }
                this.out.write(NEWLINE);
            }
        }

        public synchronized void close(Reporter reporter) throws IOException {
            this.out.close();
        }

        static {
            NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
        }
    }

    @Override
    public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name,
                                              Progressable progress) throws IOException {
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = job.get("mapreduce.output.textoutputformat.separator", "\t");
        if (!isCompressed) {
            Path file = FileOutputFormat.getTaskOutputPath(job, name);
            FileSystem fs = file.getFileSystem(job);
            Path newFile = new Path(FileOutputFormat.getOutputPath(job), name);
            FSDataOutputStream fileOut = null;
            if (fs.exists(newFile)) {
                // File already exists: append to it
                fileOut = fs.append(newFile);
            } else {
                fileOut = fs.create(file, progress);
            }
            return new AppendTextOutputFormat.MyLineRecordWriter<K, V>(fileOut, keyValueSeparator);
        } else {
            Class<? extends CompressionCodec> codecClass = getOutputCompressorClass(job, GzipCodec.class);
            CompressionCodec codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, job);
            Path file = FileOutputFormat.getTaskOutputPath(job, name + codec.getDefaultExtension());
            FileSystem fs = file.getFileSystem(job);
            Path newFile = new Path(FileOutputFormat.getOutputPath(job), name);
            FSDataOutputStream fileOut = null;
            if (fs.exists(newFile)) {
                // File already exists: append to it
                fileOut = fs.append(newFile);
            } else {
                fileOut = fs.create(file, progress);
            }
            return new AppendTextOutputFormat.MyLineRecordWriter<K, V>(
                    new DataOutputStream(codec.createOutputStream(fileOut)), keyValueSeparator);
        }
    }
}
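Separately, the driver calls getRegularTime(sparkSession, kuduMaster, classifyTbl) to load the time-slice configuration from the Kudu table impala::config.bus_calibration_time, but that helper is not shown above. Below is a minimal sketch of such a lookup using the kudu-spark connector; the column names bus_name and calibration_time are assumptions about the table's schema, not taken from the source.

import java.util.HashMap;
import java.util.Map;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Sketch only: read the calibration-time table from Kudu and build a lookup map.
// Column names below are assumed, not confirmed by the source.
public class RegularTimeSketch {

    public static Map<String, String> getRegularTime(SparkSession spark, String kuduMaster, String table) {
        Dataset<Row> df = spark.read()
                .format("org.apache.kudu.spark.kudu")
                .option("kudu.master", kuduMaster)
                .option("kudu.table", table)   // e.g. impala::config.bus_calibration_time
                .load();
        Map<String, String> regularTime = new HashMap<>();
        for (Row row : df.collectAsList()) {
            regularTime.put(row.getAs("bus_name"), row.getAs("calibration_time"));
        }
        return regularTime;
    }
}

A map built this way can then be broadcast to the executors for classifying records into time buckets, which is what the Broadcast and ClassTag imports in the driver suggest.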