spark-shell --packages io.delta:delta-core_2.11:0.1.0
If you hit the following error, your Spark and delta-core builds most likely use different Scala versions. The Spark binaries on the official download page are compiled with Scala 2.11, except for Spark 2.4.2, which uses Scala 2.12. Since we are running Spark 2.4.3, the Delta artifact must also be the Scala 2.11 build.
java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider org.apache.spark.sql.delta.sources.DeltaDataSource could not be instantiated
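To confirm which Scala version your Spark build ships with, you can check it from inside spark-shell (a minimal sketch; the startup banner of spark-shell also prints the Scala version). If it turns out to be a 2.12 build, use the matching io.delta:delta-core_2.12 artifact instead.
scala> util.Properties.versionNumberString   // e.g. "2.11.12" for a Scala 2.11 build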
// Generate a sample Dataset
val data = spark.range(0, 5)
scala> data.show
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
// Write the data to a local directory in Delta Lake format
data.write.format("delta").save("/tmp/delta-table")
// Read the Delta Lake data back with Spark as a DataFrame
val df = spark.read.format("delta").load("/tmp/delta-table")
df.show()
scala> df.show()
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+
import org.apache.spark.sql.SaveMode
// Save the data as a Hive table
df.write.mode(SaveMode.Overwrite).saveAsTable("test.delta_table")
// Query the table
spark.sql("select * from test.delta_table").show(5)
scala> spark.sql("select * from test.delta_table").show(5)
+---+
| id|
+---+
|  3|
|  4|
|  1|
|  2|
|  0|
+---+
// Alternatively, create the table with the following statement
spark.sql("CREATE TABLE delta_table USING DELTA LOCATION '/tmp/delta-table'")
// Reading a file
val df = spark.read.option("header", true).csv("/opt/data/testdata.csv")
// Creating a table
df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save("/delta-table/product")
// Reading a table
val df1 = spark.read.format("delta").load("/delta-table/product")
df1.show()
+---------+---+
|     name|age|
+---------+---+
| xiaoming| 20|
|xiaoqiang| 21|
| xiaohong| 19|
|   xiaoli| 18|
+---------+---+
// Adding column to table
import org.apache.spark.sql.functions.lit
val newDF = df.withColumn("Country", lit("China"))
newDF.write.format("delta").mode("overwrite").option("mergeSchema", "true").save("/delta-table/product")
// New Table
val df2 = spark.read.format("delta").load("/delta-table/product")
df2.show()
+---------+---+-------+
|     name|age|Country|
+---------+---+-------+
| xiaoming| 20|  China|
|xiaoqiang| 21|  China|
| xiaohong| 19|  China|
|   xiaoli| 18|  China|
+---------+---+-------+
// Query a specific version (snapshot) of the Delta table
val timeTravelDF_1 = spark.read.format("delta").option("versionAsOf", 0).load("/delta-table/product")
timeTravelDF_1.show()
+---------+---+
|     name|age|
+---------+---+
| xiaoming| 20|
|xiaoqiang| 21|
| xiaohong| 19|
|   xiaoli| 18|
+---------+---+
val timeTravelDF_2 = spark.read.format("delta").option("versionAsOf", 1).load("/delta-table/product")
timeTravelDF_2.show()
+---------+---+-------+
|     name|age|Country|
+---------+---+-------+
| xiaoming| 20|  China|
|xiaoqiang| 21|  China|
| xiaohong| 19|  China|
|   xiaoli| 18|  China|
+---------+---+-------+
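Delta Lake also supports time travel by timestamp through the timestampAsOf read option (if your Delta Lake version provides it). A minimal sketch; the timestamp below is only a placeholder you would replace with an actual commit time of your own table:
val timeTravelByTime = spark.read.format("delta").option("timestampAsOf", "2019-06-20 00:00:00").load("/delta-table/product")
timeTravelByTime.show()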
val streamingDf = spark.readStream.format("rate").load()
// Continuously write the streaming DataFrame to a local path in Delta Lake format
val stream = streamingDf.select($"value" as "id").writeStream.format("delta").option("checkpointLocation", "/tmp/checkpoint").start("/tmp/delta-table")
// Read the Delta data with Structured Streaming and print it to the console
val stream2 = spark.readStream.format("delta").load("/tmp/delta-table").writeStream.format("console").start()
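Both calls return a StreamingQuery handle, and the two queries keep running in the background until they are stopped explicitly; a minimal sketch:
stream.stop()
stream2.stop()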
package test
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.{Dataset, SparkSession, functions}
/**
 * Created by kuangbin on 2019/6/10.
 */
object StructuredStreamingToDeltaLakeDemo {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("test").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    import spark.implicits._
    // Read the stream from Kafka
    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.232.25:9092")
      .option("subscribe", "kuangtopic")
      .load()
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]
    // Split each line into words and run a windowed word count with a 10-second watermark
    val df3 = df2.as[(String, java.sql.Timestamp)]
      .flatMap(line =>
        line._1.split(" ").map(word => (word, line._2))
      ).toDF("word", "timestamp")
      .withWatermark("timestamp", "10 seconds")
      .groupBy(functions.window($"timestamp", "10 seconds", "5 seconds"), $"word").count()
    // val query: StreamingQuery = df3.select("*").writeStream.format("delta").option("checkpointLocation", "hdfs://flink:9000/tmp/checkpoint4").start("hdfs://flink:9000/tmp/delta-table4")
    val query: StreamingQuery = df3.writeStream.outputMode("complete")
      .format("console")
      .start()
    query.awaitTermination()
  }
}
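If you switch from the console sink to the commented-out Delta sink above, you can verify the results afterwards with a plain batch read of the Delta path. A minimal sketch, reusing the HDFS path from the commented-out line:
spark.read.format("delta").load("hdfs://flink:9000/tmp/delta-table4").show()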
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.util.Date;
import java.util.HashMap;
import java.util.Random;
/**
 *
 * Created by kuangbin on 2019/6/20.
 */
public class MockLogData {
    public static void main(String[] args) throws Exception {
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("a.log"), "utf-8"));
        Random random = new Random();
        String[] provinces = new String[]{"Jiangsu", "Hubei", "Hunan", "Henan", "Hebei"};
        HashMap<String, String[]> provinceCityMap = new HashMap<>();
        provinceCityMap.put("Jiangsu", new String[]{"Nanjing", "Suzhou"});
        provinceCityMap.put("Hubei", new String[]{"Wuhan", "Jingzhou"});
        provinceCityMap.put("Hunan", new String[]{"Changsha", "Xiangtan"});
        provinceCityMap.put("Henan", new String[]{"Zhengzhou", "Luoyang"});
        provinceCityMap.put("Hebei", new String[]{"Shijiazhuang", "Tangshan"});
        while (true) {
            String province = provinces[random.nextInt(5)];
            String city = provinceCityMap.get(province)[random.nextInt(2)];
            //val userid = UUID.randomUUID().toString.replaceAll("-","").substring(0,8)
            String userid = String.valueOf(random.nextInt(1000000) + 100000);
            //val adid = UUID.randomUUID().toString.replaceAll("-","").substring(0,8)
            String adid = String.valueOf(random.nextInt(1000000) + 100000);
            // Record format: timestamp province city userid adid
            String log = new Date().getTime() + " " + province + " " + city + " " + userid + " " + adid;
            System.out.println(log);
            bw.write(log);
            bw.newLine();
            bw.flush();
            try {
                Thread.sleep(100);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }
}
Configure the log-kafka.conf file with the following contents:
agent.sources = s1
agent.channels = c1
agent.sinks = k1
agent.sources.s1.type=exec
agent.sources.s1.command=tail -F /root/data/a.log
agent.sources.s1.channels=c1
agent.channels.c1.type=memory
agent.channels.c1.capacity=10000
agent.channels.c1.transactionCapacity=100
agent.sinks.k1.type= org.apache.flume.sink.kafka.KafkaSink
agent.sinks.k1.brokerList=192.168.232.25:9092
agent.sinks.k1.topic=kuangtopic
agent.sinks.k1.serializer.class=kafka.serializer.StringEncoder
agent.sinks.k1.channel=c1
Run bin/flume-ng agent --conf-file conf/log-kafka.conf -c conf/ --name agent -Dflume.root.logger=DEBUG,console to start the Flume agent, which ships the log file to Kafka.
package spark
import java.util.Date
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.execution.streaming.FileStreamSource.Timestamp
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode, StreamingQuery}
import utils.DateUtils
import scala.collection.mutable
/**
 * Created by kuangbin on 2019/6/14.
 */
object AdClickRealTimeStatSpark {
  val spark: SparkSession = SparkSession.builder().master("local[*]").appName("test").getOrCreate()
  spark.sparkContext.setLogLevel("WARN")
  import spark.implicits._
  def main(args: Array[String]): Unit = {
    val df: DataFrame = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "192.168.232.25:9092")
      .option("subscribe", "kuangtopic")
      .load()
    val df5 = calculateRealTimeStat(df)
    val df7 = calculateDailyStat(df)
    // val query: StreamingQuery = df7.writeStream.outputMode("complete")
    //   .format("console")
    //   .start()
    //
    // val query2: StreamingQuery = df5.writeStream.outputMode("complete")
    //   .format("console")
    //   .start()
    val query3: StreamingQuery = df5.writeStream.outputMode("update")
      .format("console")
      .start()
    // query.awaitTermination()
    // query2.awaitTermination()
    query3.awaitTermination()
  }
  /**
   * Compute real-time statistics over the ad click stream
   * @param df
   * @return
   */
  def calculateRealTimeStat(df: DataFrame): DataFrame = {
    // Read the data from Kafka as (String, Timestamp)
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]
    // Parse the click records into (clickTimestamp, provice, city, userid, adid, timestamp)
    val df3 = df2.as[(String, java.sql.Timestamp)].map(a => {
      val splitString: Array[String] = a._1.split(" ")
      (splitString(0), splitString(1), splitString(2), splitString(3), splitString(4), a._2)
    })
    val df4 = df3.toDF("clickTimestamp", "provice", "city", "userid", "adid", "timestamp")
    // Alternatively, transform the data into (key, 1) pairs:
    // val df4 = df3.map(a => {
    //   val timestamp: String = a._1
    //   val provice: String = a._2
    //   val city: String = a._3
    //   val adid: String = a._5
    //   val key: String = timestamp + "_" + provice + "_" + city + "_" + adid
    //   (key, 1L)
    // }).as[(String, Long)]
    //   .toDF("key", "conut")
    val df5: DataFrame = df4.groupBy("clickTimestamp", "provice", "city", "adid").count()
    df5
  }
  /**
   * Compute per-day, per-province, per-ad statistics
   * @param df
   * @return
   */
  def calculateDailyStat(df: DataFrame): DataFrame = {
    // Read the data from Kafka as (String, Timestamp)
    val df2: Dataset[(String, Timestamp)] = df.selectExpr("CAST(value AS STRING)", "CAST(timestamp AS Timestamp)")
      .as[(String, Timestamp)]
    // Parse the click records into (clickTimestamp, provice, city, userid, adid, timestamp)
    val df3 = df2.as[(String, java.sql.Timestamp)].map(a => {
      val splitString: Array[String] = a._1.split(" ")
      (splitString(0), splitString(1), splitString(2), splitString(3), splitString(4), a._2)
    })
    val df6 = df3.map(a => {
      val date = new Date(a._1.toLong)
      val dateKey: String = DateUtils.formatDateKey(date)
      val provice = a._2
      val adid = a._5
      (dateKey, provice, adid)
    }).toDF("dateKey", "provice", "adid")
    val df7 = df6.groupBy("dateKey", "provice", "adid").count()
    df7
  }
  /**
   * Compute the daily top 3 ads per province
   * @param df
   * @return
   */
  def calculateProvinceTop3Ad(df: DataFrame): Dataset[Update] = {
    val logs: Dataset[Update] = df.as[log]
      .map(_.value.split(" ")).map(v => Events(getTimepstamp(v(0)), v(1), v(2), v(3).toLong, v(4).toLong))
      // .withWatermark("timeStamp", "1 minutes")
      .groupByKey(event => event.province)
      .flatMapGroupsWithState(outputMode = OutputMode.Update(), timeoutConf = GroupStateTimeout.NoTimeout())(topNCountPerProvince)
    logs
  }
  /**
   *
   * @param Province key
   * @param events values
   * @param state the state
   * @return top 2 cities of each province
   */
  def topNCountPerProvince(Province: String, events: Iterator[Events], state: GroupState[State]): Iterator[Update] = {
    val oldState = if (state.exists) state.get else State(Province, mutable.Map[String, Int]())
    val cityMaps = oldState.cityCounts
    // Merge this batch's per-city counts into the accumulated state
    events
      .toSeq
      .groupBy(events => events.city)
      .map(f => (f._1, f._2.size))
      .foreach(v => {
        val city = v._1
        val count = v._2
        if (cityMaps.contains(city)) {
          cityMaps(city) += count
        } else {
          cityMaps.getOrElseUpdate(city, count)
        }
      })
    val newState = State(Province, cityMaps)
    state.update(newState)
    /**
     * Take the top 2 cities by count; cities with the same count are all kept. The output looks roughly like this:
     * +----------+-----------+---------+
     * | province | city      | count   |
     * +----------+-----------+---------+
     * | beijing  | xicheng   | 400     |
     * +----------+-----------+---------+
     * | beijing  | dongcheng | 332     |
     * +----------+-----------+---------+
     * | beijing  | chaoyang  | 332     |
     * +----------+-----------+---------+
     */
    val output = cityMaps.groupBy(_._2)
      .toList
      .sortWith(_._1 > _._1)
      .take(2)
      .flatMap(f => f._2.toSeq)
      .map(v => Update(Province, v._1, v._2))
    output.toIterator
  }
  def getTimepstamp(tm: String): java.sql.Timestamp = {
    new java.sql.Timestamp(tm.toLong)
  }
  case class State(province: String, cityCounts: mutable.Map[String, Int])
  case class log(key: String, value: String)
  case class msgs(key: String, value: String)
  case class Events(timeStamp: java.sql.Timestamp, province: String, city: String, userid: Long, adId: Long)
  case class Update(province: String, city: String, count: Int)
}
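Two pieces are worth noting. First, calculateProvinceTop3Ad is defined but never started in main; the sketch below is an illustration, not part of the original program, and would go inside main after df is defined. It casts the Kafka key/value columns to strings so they match the log case class, and uses the update output mode that flatMapGroupsWithState with OutputMode.Update requires:
val top3Query: StreamingQuery = calculateProvinceTop3Ad(
    df.selectExpr("CAST(key AS STRING) AS key", "CAST(value AS STRING) AS value"))
  .writeStream
  .outputMode("update")
  .format("console")
  .start()
top3Query.awaitTermination()
Second, the utils.DateUtils helper imported above is not shown in this post; a hypothetical minimal version that matches how formatDateKey is used here could look like this:
package utils
import java.text.SimpleDateFormat
import java.util.Date
object DateUtils {
  // Formats a Date into a day-level key such as "2019-06-20" (assumed format)
  def formatDateKey(date: Date): String = new SimpleDateFormat("yyyy-MM-dd").format(date)
}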