Spark Streaming can consume Kafka in two ways (receiver-based and direct); here the direct, no-receivers approach is used.
Spark version: 2.4.0-cdh6.2.0
Use Spark Streaming to consume Kafka data and write it into Hive:
package com.pica.bi.zhuque.stream
import java.sql.Timestamp
import java.util.Date
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.{DataFrame, Row, SparkSession, types}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, TimestampType}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
object ParseLog {
  // Fields extracted from each JSON element; the order must match the schema built in createDf
  val traceFields = List(
    "id",
    "remark1",
    "remark2",
    "remark3",
    "remark4",
    "remark5",
    "created_time")
  def main(args: Array[String]): Unit = {
    // Entry point: build a SparkSession with Hive support.
    // spark.serializer is a static config, so set it on the builder instead of calling
    // spark.conf.set after the SparkContext already exists (where it would have no effect).
    val spark = SparkSession
      .builder()
      // .master("local[*]")
      .appName("Spark SQL To Hive")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .enableHiveSupport()
      .getOrCreate()
    // 120-second batch interval
    val ssc = new StreamingContext(spark.sparkContext, Seconds(120))
    val kafkaParams = Map(
      "bootstrap.servers" -> "broker1:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "gp1101",
      // If the group has no committed offset yet, start from the earliest available data
      "auto.offset.reset" -> "earliest",
      // Offset bookkeeping: before Kafka 0.10 offsets were kept in ZooKeeper; from 0.10 on
      // they live in the internal __consumer_offsets topic. Enable automatic offset commits.
      "enable.auto.commit" -> (true: java.lang.Boolean)
    )
    val topics = Array("test1")
    // Direct (no-receivers) connection
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))
    stream.map(cr => cr.value()).filter(_.contains("json:")).foreachRDD(rs => {
      val rdd = rs.flatMap(line => {
        // Strip a trailing "]" or ")" if one is present
        var linex = line
        if (line.endsWith("]") || line.endsWith(")")) {
          linex = line.dropRight(1)
        }
        val jsonStr = linex.split("json:").apply(1)
        // println(s"jsonStr:${jsonStr}")
        val jsonObj = JSON.parseObject(jsonStr)
        val lines = new ListBuffer[Row]()
        if (jsonObj.containsKey("datas")) {
          val jsonArr: JSONArray = jsonObj.getJSONArray("datas")
          if (jsonArr.size() > 0) {
            for (i <- 0 until jsonArr.size()) {
              val eachJson: JSONObject = jsonArr.getJSONObject(i)
              // One fresh buffer per element so each Row carries exactly one value per traceFields entry
              val fieldValues = ArrayBuffer[Any]()
              for (field <- traceFields) {
                if (field.equals("id")) {
                  // id defaults to 0; the schema expects an integer
                  fieldValues.append(if (eachJson.containsKey(field)) eachJson.getIntValue(field) else 0)
                } else if (field.equals("created_time")) {
                  fieldValues.append(new Timestamp(new Date().getTime()))
                } else if (eachJson.containsKey(field)) {
                  fieldValues.append(eachJson.getString(field))
                } else {
                  fieldValues.append("")
                }
              }
              lines.append(Row.fromSeq(fieldValues.toSeq))
            }
          }
        }
        lines.toList
      })
      val df = createDf(spark, rdd)
      writeToHive(spark, df)
    })
    ssc.start()
    ssc.awaitTermination()
  }
  def createDf(spark: SparkSession, rdd: RDD[Row]): DataFrame = {
    // Build the schema in the same field order as traceFields
    val schemaList = new ListBuffer[StructField]
    traceFields.foreach(eachField => {
      val struct =
        if (eachField.equals("created_time")) {
          StructField(eachField, TimestampType, false)
        } else if (eachField.equals("id")) {
          StructField(eachField, IntegerType, false)
        } else {
          StructField(eachField, StringType, false)
        }
      schemaList.append(struct)
    })
    val schema = types.StructType(schemaList.toList)
    println(schema)
    val resDF = spark.createDataFrame(rdd, schema)
    // resDF.printSchema()
    // resDF.show(false)
    resDF
  }
  def writeToHive(spark: SparkSession, df: DataFrame): Unit = {
    df.createOrReplaceTempView("temp_table")
    // Target partition (hardcoded date)
    val tday = "2019-11-08"
    val sql = s"insert into pica_temp.table partition(created_day='${tday}') select * from temp_table"
    println(s"About to execute: ${sql}")
    spark.sql(sql)
  }
}
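To build and submit the job, the Kafka 0-10 integration, Hive support and fastjson have to be on the classpath; a CDH cluster already provides most of the Spark jars. A minimal sbt sketch (artifact versions here are assumptions, adjust them to whatever the cluster actually ships):

// Hypothetical sbt dependencies for the job above; spark-* jars are usually provided by the cluster
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"                  % "2.4.0" % "provided",
  "org.apache.spark" %% "spark-hive"                 % "2.4.0" % "provided",
  "org.apache.spark" %% "spark-streaming"            % "2.4.0" % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.0",
  "com.alibaba"      %  "fastjson"                   % "1.2.62"
)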
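The insert statement in writeToHive assumes the target table already exists and is partitioned by created_day (pica_temp.table is kept exactly as it appears in the listing and is presumably a placeholder name). A hypothetical DDL that matches the schema built in createDf, run once through spark.sql:

// Hypothetical DDL matching createDf's schema; database/table names are taken from the listing as-is
spark.sql(
  """CREATE TABLE IF NOT EXISTS pica_temp.table (
    |  id           INT,
    |  remark1      STRING,
    |  remark2      STRING,
    |  remark3      STRING,
    |  remark4      STRING,
    |  remark5      STRING,
    |  created_time TIMESTAMP
    |) PARTITIONED BY (created_day STRING)""".stripMargin)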
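Note that with enable.auto.commit set to true the consumer commits offsets on its own schedule, regardless of whether the batch actually made it into Hive, so a failure can lose or re-read records. For at-least-once behavior one can set it to false and commit the offset ranges only after each batch has been processed; a minimal sketch using the kafka010 API (not part of the job above):

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}

// Sketch: assumes "enable.auto.commit" -> (false: java.lang.Boolean) in kafkaParams
stream.foreachRDD { rdd =>
  // Offset ranges are only available on the original Kafka RDD, before any map/filter
  val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  val values = rdd.map(_.value()).filter(_.contains("json:"))
  // ... parse `values` and write to Hive as in the listing above ...
  // Commit only after the batch has been handled
  stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}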