<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>hive_sql</groupId>
  <artifactId>hive_sql</artifactId>
  <version>1.0-SNAPSHOT</version>
  <inceptionYear>2008</inceptionYear>

  <properties>
    <!-- must match the _2.12 Spark artifacts below (the archetype default 2.7.0 would not) -->
    <scala.version>2.12.12</scala.version>
    <spark.version>2.4.3</spark.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>
  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
      <version>${spark.version}</version>
    </dependency>
  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <!-- note: for Scala 2.12 the newer net.alchim31.maven:scala-maven-plugin is normally used instead -->
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <!-- Scala 2.12 targets Java 8, not the archetype default jvm-1.5 -->
            <arg>-target:jvm-1.8</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
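The POM above pulls in Spark SQL, Hive, Streaming and the Kafka 0.10 integration for Scala 2.12, plus fastjson for parsing the message payloads. The Scala job below uses those pieces: it consumes JSON events from the event_track topic in 5-second batches, flattens the nested "value" array into rows, and appends each batch both as Parquet files on HDFS and into the report.oe_kafka Hive table.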
package hive_sql
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.sql._
import org.apache.spark.sql.SparkSession
// streaming
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType, DoubleType, FloatType}
import com.alibaba.fastjson.{JSON,JSONObject,JSONArray}
import org.apache.spark.streaming.kafka010._
import org.apache.kafka.common.serialization.StringDeserializer
import scala.collection.mutable.ArrayBuffer
import scala.util.Try
object hiv_sql {
def main(args: Array[String]): Unit = {
// spark config
val conf = new SparkConf().setAppName("oe streaming")
conf.setMaster("local")
conf.set("spark.testing.memory","2147480000")
conf.set("spark.executor.instances", "4")
conf.set("spark.executor.memory", "2g")
conf.set("spark.driver.allowMultipleContexts", "true")
conf.set("hive.metastore.uris", "thrift://IP:9083")
conf.set("spark.sql.warehouse.dir", "hdfs://IP:8020/user/hive/warehouse")
// spark sql
val spark = SparkSession
.builder()
.config(conf)
.enableHiveSupport()
.getOrCreate()
val sql = "show databases"
val r = spark.sql(sql)
r.show()
spark.sql("use report")
// streaming context: 5-second micro-batches
val ssc = new StreamingContext(spark.sparkContext, Seconds(5))
ssc.sparkContext.setLogLevel("ERROR")
val bootstrapServers = "IP:9092,IP:9092,IP:9092"
val groupId = "test-group-id7"
val topicName = "event_track"
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> bootstrapServers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupId,//消费者组名
"auto.offset.reset" -> "earliest", //latest自动重置偏移量为最新的偏移量
"enable.auto.commit" -> (false: java.lang.Boolean) //如果是true,则这个消费者的偏移量会在后台自动提交
)
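// PreferConsistent spreads Kafka partitions evenly across the available executors;
// Subscribe consumes the topic set with the consumer settings defined above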
val kafkaTopicDS = KafkaUtils.createDirectStream(
ssc,
LocationStrategies.PreferConsistent,
ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)
)
// df schema
val schema = StructType(
List(
StructField("brandCode", IntegerType, true),
// StructField("brandCode", IntegerType, true),
StructField("brandName", StringType, true),
StructField("categoryName", StringType, true),
StructField("cityName", StringType, true),
StructField("date", StringType, true),
StructField("endTime", StringType, true),
StructField("nlogoCode", IntegerType, true),
StructField("nlogoName", StringType, true),
StructField("oe", StringType, true),
StructField("proviceName", StringType, true),
StructField("startTime", StringType, true),
StructField("status", StringType, true),
StructField("userID", StringType, true),
StructField("userPurchaseId", IntegerType, true),
StructField("vinCode", StringType, true),
)
)
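// every field is nullable; brandCode, nlogoCode and userPurchaseId arrive as strings
// and are converted with tryToInt below, so a malformed number does not fail the batch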
// kafkaTopicDS.map(_.value).print()
// transform
kafkaTopicDS.map(_.value).foreachRDD(rdd=>{
// trdd: the whole batch flattened into one Row per event
val trdd = rdd.map(x => JSON.parseObject(x.toString).getJSONArray("value"))
.map(x=>format_value(x))
.flatMap(x => x)
.map(x=>Row(
tryToInt(x.getString("brandCode")).getOrElse(null),
// x.getString("brandCode"),
x.getString("brandName"),
x.getString("categoryName"),
x.getString("cityName"),
x.getString("date"),
x.getString("endTime"),
tryToInt(x.getString("nlogoCode")).getOrElse(null),
// x.getString("nlogoCode").toInt,
x.getString("nlogoName"),
x.getString("oe"),
x.getString("proviceName"),
x.getString("startTime"),
x.getString("status"),
x.getString("userID"),
// x.getString("userPurchaseId"),
tryToInt(x.getString("userPurchaseId")).getOrElse(null),
x.getString("vinCode"),
))
val rddDF = spark.createDataFrame(trdd,schema)
if (rddDF.count()>0){
// Option 1: write the batch as Parquet files to HDFS (the table's warehouse path)
val value = rddDF.coalesce(1)
value.write.format("parquet").mode("append").save("hdfs://IP:8020/user/hive/warehouse/report.db/oe_kafka")
// Option 2: write straight into a Hive table through a temp view (as written, both options run, so each batch is persisted twice)
rddDF.createOrReplaceTempView("v_table")
spark.sql("select * from v_table").write.mode("append").saveAsTable("oe_kafka")
}
})
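// helper: returns Some(n) for a parseable integer string, None otherwise (used instead of toInt so bad values become null)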
def tryToInt(s: String): Option[Int] = Try(s.toInt).toOption
// unpack the "value" JSONArray into its individual JSONObject records
def format_value(t: JSONArray): ArrayBuffer[JSONObject] = {
val result = ArrayBuffer[JSONObject]()
for (i <- 0 until t.size) result.append(t.getJSONObject(i))
result
}
// start the streaming job and block until it terminates
ssc.start()
ssc.awaitTermination()
}
}
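For reference, here is a minimal sketch of the payload shape the job expects on the event_track topic: a JSON object whose "value" key holds an array of event records, one per output row. It parses a hand-written sample with fastjson using the same call chain as the stream. The object name payload_check and every field value are made up for illustration; only the keys mirror the schema above.

package hive_sql

import com.alibaba.fastjson.JSON

// Standalone check of the expected message shape; runs without Kafka, Spark or Hive.
// All values below are hypothetical sample data.
object payload_check {
  def main(args: Array[String]): Unit = {
    val sample =
      """{"value":[{"brandCode":"12","brandName":"demoBrand","categoryName":"demoCategory",
        |"cityName":"demoCity","date":"2020-01-01","endTime":"2020-01-01 00:00:05",
        |"nlogoCode":"34","nlogoName":"demoModel","oe":"OE123","proviceName":"demoProvince",
        |"startTime":"2020-01-01 00:00:00","status":"1","userID":"u001",
        |"userPurchaseId":"56","vinCode":"VIN123"}]}""".stripMargin
    // same call chain as the streaming job: parse the object, pull out the "value" array
    val events = JSON.parseObject(sample).getJSONArray("value")
    for (i <- 0 until events.size) {
      val e = events.getJSONObject(i)
      println(s"oe=${e.getString("oe")} brandCode=${e.getString("brandCode")} city=${e.getString("cityName")}")
    }
  }
}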