Reading and Writing Delta Lake with Spark Structured Streaming

1. Using Delta Lake


1.1 Launching spark-shell with Delta Lake

spark-shell --packages io.delta:delta-core_2.11:0.1.0

If you hit the error below, your Spark build and delta-core were most likely compiled against different Scala versions. The Spark binaries on the official download page are built with Scala 2.11, except for Spark 2.4.2, which uses Scala 2.12. Since we are running Spark 2.4.3, the Delta artifact must also be the Scala 2.11 build (delta-core_2.11).

java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.delta.sources.DeltaDataSource could not be instantiated
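If you are not sure which Scala version your spark-shell was built with, you can check from inside the REPL (the version also appears in the spark-shell startup banner); a quick sanity check:

scala> util.Properties.versionString
// for a Scala 2.11 build of Spark 2.4.3 this prints something like: version 2.11.12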

1.2 Maven configuration

<dependency>
    <groupId>io.delta</groupId>
    <artifactId>delta-core_2.11</artifactId>
    <version>0.1.0</version>
</dependency>
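If your project uses sbt instead of Maven, the equivalent dependency (same artifact and version as above) would look roughly like this; the %% operator appends the Scala binary version, giving delta-core_2.11:

libraryDependencies += "io.delta" %% "delta-core" % "0.1.0"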

1.3 Converting between Delta Lake, DataFrames, and Hive tables

// Generate a sample Dataset
val data = spark.range(0, 5)

scala> data.show
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+

// Write the data to a local directory in Delta Lake format
data.write.format("delta").save("/tmp/delta-table")

// Read the Delta Lake data back with Spark into a DataFrame
val df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

scala> df.show()
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+

import org.apache.spark.sql.SaveMode

// Save the data as a Hive table
df.write.mode(SaveMode.Overwrite).saveAsTable("test.delta_table")

// Query the table
spark.sql("select * from test.delta_table").show(5)

scala> spark.sql("select * from test.delta_table").show(5)
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+

// Alternatively, create the table directly on top of the Delta directory
spark.sql("CREATE TABLE delta_table USING DELTA LOCATION '/tmp/delta-table'")

// Read a CSV file
val df = spark.read.option("header", true).csv("/opt/data/testdata.csv")

// Create a Delta table from it
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/delta-table/product")

// Read the table back
val df1 = spark.read.format("delta").load("/delta-table/product")
df1.show()
+---------+---+
|     name|age|
+---------+---+
| xiaoming| 20|
|xiaoqiang| 21|
| xiaohong| 19|
|   xiaoli| 18|
+---------+---+

// Add a column to the table (schema evolution via mergeSchema)
import org.apache.spark.sql.functions.lit
val newDF = df.withColumn("Country", lit("China"))
newDF.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("/delta-table/product")

// Read the new table
val df2 = spark.read.format("delta").load("/delta-table/product")
df2.show()
+---------+---+-------+
|     name|age|Country|
+---------+---+-------+
| xiaoming| 20|  China|
|xiaoqiang| 21|  China|
| xiaohong| 19|  China|
|   xiaoli| 18|  China|
+---------+---+-------+
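For contrast, a sketch of the failure case: writing the extra column with neither mergeSchema nor overwriteSchema set is expected to be rejected by Delta with a schema-mismatch AnalysisException rather than silently changing the table.

// expected to fail with a schema-mismatch AnalysisException (sketch, not from the original run)
newDF.write.format("delta").mode("overwrite").save("/delta-table/product")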

// Query a specific version (snapshot) of the Delta table
val timeTravelDF_1 = spark.read.format("delta").option("versionAsOf", 0).load("/delta-table/product")
timeTravelDF_1.show()
+---------+---+
|     name|age|
+---------+---+
| xiaoming| 20|
|xiaoqiang| 21|
| xiaohong| 19|
|   xiaoli| 18|
+---------+---+

val timeTravelDF_2 = spark.read.format("delta").option("versionAsOf", 1).load("/delta-table/product")
timeTravelDF_2.show()
+---------+---+-------+
|     name|age|Country|
+---------+---+-------+
| xiaoming| 20|  China|
|xiaoqiang| 21|  China|
| xiaohong| 19|  China|
|   xiaoli| 18|  China|
+---------+---+-------+
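Later Delta releases also support time travel by timestamp instead of version number; a minimal sketch, with a purely illustrative timestamp:

val timeTravelDF_3 = spark.read.format("delta")
  .option("timestampAsOf", "2019-06-20 00:00:00")  // illustrative timestamp, not from the original run
  .load("/delta-table/product")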

1.4 Reading and writing Delta with Structured Streaming

val streamingDf = spark.readStream.format("rate").load()

// Continuously write the DataFrame to a local path in Delta Lake format
val stream = streamingDf.select($"value" as "id").writeStream.format("delta").option("checkpointLocation", "/tmp/checkpoint").start("/tmp/delta-table")

// Read the Delta data with Structured Streaming and print it to the console
val stream2 = spark.readStream.format("delta").load("/tmp/delta-table").writeStream.format("console").start()
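Both start() calls return a StreamingQuery handle, so in spark-shell you would typically either block on one of them or stop them once you are done experimenting:

// block the shell until the console query terminates
stream2.awaitTermination()

// or shut both queries down cleanly when finished
stream.stop()
stream2.stop()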

2. Using Structured Streaming


2.1 Reading Kafka data with Structured Streaming and writing it to Delta Lake


package test

import java.sql.Timestamp // not FileStreamSource.Timestamp, which resolves to a Long alias

import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{Dataset, SparkSession, functions}

/**
  * Created by kuangbin on 2019/6/10.
  */
object StructuredStreamingToDeltaLakeDemo {

  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("test").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._

    // Subscribe to the Kafka topic
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.232.25:9092")
      .option("subscribe", "kuangtopic")
      .load()

    // Cast the Kafka value and timestamp columns to usable types
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]

    // Split each line into words and do a windowed word count with a watermark
    val df3 = df2
      .flatMap(line => line._1.split(" ").map(word => (word, line._2)))
      .toDF("word", "timestamp")
      .withWatermark("timestamp", "10 seconds")
      .groupBy(functions.window($"timestamp", "10 seconds", "5 seconds"), $"word").count()

    // val query: StreamingQuery = df3.select("*").writeStream.format("delta").option("checkpointLocation", "hdfs://flink:9000/tmp/checkpoint4").start("hdfs://flink:9000/tmp/delta-table4")

    val query: StreamingQuery = df3.writeStream.outputMode("complete")
      .format("console")
      .start()

    query.awaitTermination()
  }
}
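If the commented-out Delta sink above is enabled instead of the console sink (a streaming aggregation written to a file-based sink uses append mode, which the watermark makes possible), the word counts land in a Delta table that can later be inspected as a plain batch DataFrame; a minimal sketch reusing the same SparkSession:

// Sketch: batch-read the Delta table produced by the commented-out Delta sink above
val wordCounts = spark.read.format("delta").load("hdfs://flink:9000/tmp/delta-table4")
wordCounts.orderBy("window").show(false)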

3. Simulating a production environment


3.1 Generating a mock log file


import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.Random;

/**
 * Created by kuangbin on 2019/6/20.
 */
public class MockLogData {

    public static void main(String[] args) throws Exception {
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("a.log"), "utf-8"));
        Random random = new Random();

        String[] provinces = new String[]{"Jiangsu", "Hubei", "Hunan", "Henan", "Hebei"};
        HashMap<String, String[]> provinceCityMap = new HashMap<>();
        provinceCityMap.put("Jiangsu", new String[]{"Nanjing", "Suzhou"});
        provinceCityMap.put("Hubei", new String[]{"Wuhan", "Jingzhou"});
        provinceCityMap.put("Hunan", new String[]{"Changsha", "Xiangtan"});
        provinceCityMap.put("Henan", new String[]{"Zhengzhou", "Luoyang"});
        provinceCityMap.put("Hebei", new String[]{"Shijiazhuang", "Tangshan"});

        while (true) {
            String province = provinces[random.nextInt(5)];
            String city = provinceCityMap.get(province)[random.nextInt(2)];
            // val userid = UUID.randomUUID().toString.replaceAll("-","").substring(0,8)
            String userid = String.valueOf(random.nextInt(1000000) + 100000);
            // val adid = UUID.randomUUID().toString.replaceAll("-","").substring(0,8)
            String adid = String.valueOf(random.nextInt(1000000) + 100000);

            // Record format: timestamp province city userid adid
            String log = new Date().getTime() + " " + province + " " + city + " " + userid + " " + adid;
            System.out.println(log);
            bw.write(log);
            bw.newLine();
            bw.flush();

            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
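A generated line therefore looks like the following (the values are random; this one is purely illustrative):

1561017600000 Hubei Wuhan 345678 567890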

3.2 Collecting the log into Kafka with Flume


Create a log-kafka.conf file with the following contents:

agent.sources = s1
agent.channels = c1
agent.sinks = k1

agent.sources.s1.type = exec
agent.sources.s1.command = tail -F /root/data/a.log
agent.sources.s1.channels = c1

agent.channels.c1.type = memory
agent.channels.c1.capacity = 10000
agent.channels.c1.transactionCapacity = 100

agent.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
agent.sinks.k1.brokerList = 192.168.232.25:9092
agent.sinks.k1.topic = kuangtopic
agent.sinks.k1.serializer.class = kafka.serializer.StringEncoder
agent.sinks.k1.channel = c1

Run bin/flume-ng agent --conf-file conf/log-kafka.conf -c conf/ --name agent -Dflume.root.logger=DEBUG,console to start the Flume agent, which tails the log file and ships it to Kafka.
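Before wiring up Spark, you can confirm that records are arriving by attaching a console consumer to the topic; for example, with a Kafka CLI that supports --bootstrap-server:

bin/kafka-console-consumer.sh --bootstrap-server 192.168.232.25:9092 --topic kuangtopic --from-beginning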

3.3 Reading Kafka data with Structured Streaming and processing it


package spark

import java.sql.Timestamp // not FileStreamSource.Timestamp, which resolves to a Long alias
import java.util.Date

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StreamingQuery}

import utils.DateUtils

import scala.collection.mutable

/**
  * Created by kuangbin on 2019/6/14.
  */
object AdClickRealTimeStatSpark {

  val spark: SparkSession = SparkSession.builder().master("local[*]").appName("test").getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  import spark.implicits._

  def main(args: Array[String]): Unit = {
    val df: DataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.232.25:9092")
      .option("subscribe", "kuangtopic")
      .load()

    val df5 = calculateRealTimeStat(df)
    val df7 = calculateDailyStat(df)

    // val query: StreamingQuery = df7.writeStream.outputMode("complete")
    //   .format("console")
    //   .start()
    //
    // val query2: StreamingQuery = df5.writeStream.outputMode("complete")
    //   .format("console")
    //   .start()

    val query3: StreamingQuery = df5.writeStream.outputMode("update")
      .format("console")
      .start()

    // query.awaitTermination()
    // query2.awaitTermination()
    query3.awaitTermination()
  }

  /**
    * Real-time statistics for the ad click stream
    * @param df raw Kafka DataFrame
    * @return click counts grouped by timestamp, province, city and ad id
    */
  def calculateRealTimeStat(df: DataFrame): DataFrame = {
    // The Kafka records arrive as (String, Timestamp)
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]

    // Parse each line into (clickTimestamp, province, city, userid, adid, timestamp)
    val df3 = df2.map(a => {
      val splitString: Array[String] = a._1.split(" ")
      (splitString(0), splitString(1), splitString(2), splitString(3), splitString(4), a._2)
    })
    val df4 = df3.toDF("clickTimestamp", "province", "city", "userid", "adid", "timestamp")

    // Alternative: turn the data into (key, 1) pairs
    // val df4 = df3.map(a => {
    //   val timestamp: String = a._1
    //   val province: String = a._2
    //   val city: String = a._3
    //   val adid: String = a._5
    //   val key: String = timestamp + "_" + province + "_" + city + "_" + adid
    //   (key, 1L)
    // }).as[(String, Long)]
    //   .toDF("key", "count")

    val df5: DataFrame = df4.groupBy("clickTimestamp", "province", "city", "adid").count()
    df5
  }

  /**
    * Per-day statistics per province and ad
    * @param df raw Kafka DataFrame
    * @return click counts grouped by day, province and ad id
    */
  def calculateDailyStat(df: DataFrame): DataFrame = {
    // The Kafka records arrive as (String, Timestamp)
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]

    // Parse each line into (clickTimestamp, province, city, userid, adid, timestamp)
    val df3 = df2.map(a => {
      val splitString: Array[String] = a._1.split(" ")
      (splitString(0), splitString(1), splitString(2), splitString(3), splitString(4), a._2)
    })

    val df6 = df3.map(a => {
      val date = new Date(a._1.toLong)
      val dateKey: String = DateUtils.formatDateKey(date)
      val province = a._2
      val adid = a._5
      (dateKey, province, adid)
    }).toDF("dateKey", "province", "adid")

    val df7 = df6.groupBy("dateKey", "province", "adid").count()
    df7
  }

  /**
    * Per-province top ad-click statistics kept as arbitrary state
    * (the state function below returns the top-2 cities per province)
    * @param df DataFrame with string key/value columns
    * @return stream of per-province Update rows
    */
  def calculateProvinceTop3Ad(df: DataFrame): Dataset[Update] = {
    val logs: Dataset[Update] = df.as[log]
      .map(_.value.split(" ")).map(v => Events(getTimepstamp(v(0)), v(1), v(2), v(3).toLong, v(4).toLong))
      // .withWatermark("timeStamp", "1 minutes")
      .groupByKey(event => event.province)
      .flatMapGroupsWithState(outputMode = OutputMode.Update(), timeoutConf = GroupStateTimeout.NoTimeout())(topNCountPerProvince)
    logs
  }

  /**
    * @param Province key
    * @param events values
    * @param state the state
    * @return top-2 cities of the province
    */
  def topNCountPerProvince(Province: String, events: Iterator[Events], state: GroupState[State]): Iterator[Update] = {
    val oldState = if (state.exists) state.get else State(Province, mutable.Map[String, Int]())
    val cityMaps = oldState.cityCounts

    // Merge this batch's counts into the running per-city counts
    events
      .toSeq
      .groupBy(events => events.city)
      .map(f => (f._1, f._2.size))
      .foreach(v => {
        val city = v._1
        val count = v._2
        if (cityMaps.contains(city)) {
          cityMaps(city) += count
        } else {
          cityMaps.getOrElseUpdate(city, count)
        }
      })

    val newState = State(Province, cityMaps)
    state.update(newState)

    /**
      * Take the top-2 cities by count; cities tied on count are all kept.
      * The output looks roughly like this:
      * +----------+-----------+---------+
      * | province | city      | count   |
      * +----------+-----------+---------+
      * | beijing  | xicheng   | 400     |
      * +----------+-----------+---------+
      * | beijing  | dongcheng | 332     |
      * +----------+-----------+---------+
      * | beijing  | chaoyang  | 332     |
      * +----------+-----------+---------+
      */
    val output = cityMaps.groupBy(_._2)
      .toList
      .sortWith(_._1 > _._1)
      .take(2)
      .flatMap(f => f._2.toSeq)
      .map(v => Update(Province, v._1, v._2))

    output.toIterator
  }

  def getTimepstamp(tm: String): java.sql.Timestamp = {
    new java.sql.Timestamp(tm.toLong)
  }

  case class State(province: String, cityCounts: mutable.Map[String, Int])
  case class log(key: String, value: String)
  case class msgs(key: String, value: String)
  case class Events(timeStamp: java.sql.Timestamp, province: String, city: String, userid: Long, adId: Long)
  case class Update(province: String, city: String, count: Int)
}
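Note that calculateProvinceTop3Ad is defined but never called from main. A minimal sketch of how it could be wired in (hypothetical, not part of the original code): the raw Kafka key/value columns are binary and need to be cast to strings first, and because flatMapGroupsWithState runs with OutputMode.Update, the writer must also use the update output mode:

// Hypothetical wiring for calculateProvinceTop3Ad, assuming the df created in main
val kv = df.selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value")
val top = calculateProvinceTop3Ad(kv)
val topQuery = top.writeStream.outputMode("update").format("console").start()
topQuery.awaitTermination()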
