Download link: events
Extraction code: ngs3
sbin/start-all.sh
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
hdfs dfs -cat /eventData/users.csv | head -n 10
val dfUsers = spark.read.format("csv").option("header","true").load("hdfs:///eventData/users.csv")
dfUsers.show
dfUsers.printSchema
Requirement 1: check whether users.csv contains duplicate rows
Compare the total row count with the number of distinct user_id values: if they match, there are no duplicates; if the distinct user_id count is smaller, duplicate rows exist.
dfUsers.count
dfUsers.select("user_id").distinct.count
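If the two counts had differed, a quick sketch like the following (using only the columns already loaded above) would list the user_id values that occur more than once:
dfUsers.groupBy("user_id").count.filter($"count" > 1).show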
Requirement 2: how many users left the birth year empty or entered an invalid value?
val df1 = dfUsers.select(col("user_id"),col("birthyear").cast(IntegerType).as("f_birthyear"),col("birthyear"))
df1.show
val df2 = df1.filter(col("f_birthyear").isNull)
df2.show
df1.count
df2.count
df2.select("birthyear").distinct.show
Note: df2 keeps three columns (user_id, f_birthyear, birthyear) so that, when the cast to IntegerType produces a null f_birthyear, the original birthyear string is still visible; the distinct output above shows exactly which raw values failed to parse.
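A small follow-up sketch (reusing df2 from above) shows how often each invalid raw value occurs:
df2.groupBy("birthyear").count.orderBy($"count".desc).show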
Requirement 3: replace invalid and missing birthyear values with the users' average birth year
val df3 = dfUsers.withColumn("birthyear",col("birthyear").cast(IntegerType))
df3.show
val dfAvgAge = df3.select(avg(col("birthyear")).cast(IntegerType).as("avg_year"))
dfAvgAge.show
val df4 = df3.crossJoin(dfAvgAge).withColumn("new_birthyear",when(col("birthyear").isNull,col("avg_year")).otherwise(col("birthyear")))
df4.show
val df5 = df3.withColumn("birthyear",when(col("birthyear").isNull,lit(1988)).otherwise(col("birthyear")))
df5.show
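An alternative sketch that avoids both the crossJoin and the hardcoded 1988: pull the average year onto the driver with first() and fill the nulls with na.fill (dfFilled is just an illustrative name):
val avgYear = df3.select(avg($"birthyear").cast(IntegerType)).first().getInt(0)
val dfFilled = df3.na.fill(Map("birthyear" -> avgYear))
dfFilled.filter($"birthyear".isNull).count   // expect 0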
Requirement 4: the gender column contains null, female, and male; normalize it by replacing null with unknown
dfUsers.groupBy($"gender").agg(count($"user_id")).show
val df6 = dfUsers.withColumn("gender",when(($"gender").isNull,lit("unknown")).otherwise($"gender"))
df6.select("gender").distinct.show
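The same normalization can be written more compactly with na.fill (a sketch; it assumes the missing genders are real nulls rather than empty strings):
val df6b = dfUsers.na.fill("unknown", Seq("gender"))
df6b.select("gender").distinct.show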
hdfs dfs -cat /eventData/events.csv | head -n 3
val dfEvents = spark.read.format("csv").option("header","true").load("hdfs:///eventData/events.csv")
Requirement 1: count the total number of rows (deduplicated)
dfEvents.cache.count
dfEvents.select("event_id").distinct.count
Requirement 2: check whether user_id values in the events table also appear in the users table
dfEvents.createOrReplaceTempView("events")
select user_id,count(1) as count_id from events group by user_id order by count_id desc limit 10
dfUsers.createOrReplaceTempView("users")
select count(1) from events e join users u on e.user_id = u.user_id
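The same check can also be done with the DataFrame API (a sketch; it assumes both frames expose a user_id column, as the SQL above does):
dfEvents.join(dfUsers, Seq("user_id"), "inner").count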
Requirement 3: check whether there are any invalid start_time values
The expected time format is: 2020-10-31T00:00:00.001Z
From Requirement 1, events.csv contains 3137972 rows in total
select count(1) from events where start_time regexp ('^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.*')
Note: counting the rows with a valid start_time via the DataFrame API:
dfEvents.filter($"start_time".rlike("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.*")).count
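Conversely, the invalid rows can be counted by negating the filter (a quick sketch; rows with a null start_time would need a separate isNull check):
dfEvents.filter(!$"start_time".rlike("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}.*")).count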
hdfs dfs -cat /eventData/user_friends.csv | head -n 3
val dfUserFriends = spark.read.format("csv").option("header","true").load("hdfs:///eventData/user_friends.csv")
Requirement 1: count the total number of rows
dfUserFriends.count
Requirement 2: explode the friends column so that each (user_id, friend_id) pair becomes one row
val df = dfUserFriends.withColumnRenamed("user","user_id").withColumn("friend_id",explode(split($"friends"," "))).drop("friends")
// check the schema
df.printSchema
df.show(20)
dfUserFriends.createOrReplaceTempView("user_friends")
select
  user as user_id,
  friend_id
from user_friends
lateral view explode(split(friends,' ')) t as friend_id
Requirement 3: how many valid rows does the exploded table contain (friend_id not null)?
df.filter($"friend_id".isNull).show
df.filter($"friend_id".isNotNull).distinct.count
df.cache
df.count
You can see that the deduplicated valid friend_id rows are 16 fewer than the total row count, which means the exploded table contains duplicate (user_id, friend_id) pairs.
df.createOrReplaceTempView("user_friends_1")
select user_id,friend_id,count(1) from user_friends_1 group by user_id,friend_id having count(1) > 1
df.groupBy($"user_id",$"friend_id").agg(count($"user_id").as("cnt")).filter($"cnt">lit(1)).show
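To double-check that the 16-row gap is entirely due to duplicate pairs rather than null friend_id values, a quick sketch (expected values based on the counts above):
df.filter($"friend_id".isNull).count   // expect 0
df.count - df.distinct.count           // expect 16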
Requirement 4: join with the users table and count how many friends each user has, sorted in descending order; three approaches (① to ③) are compared below
df.printSchema
dfUsers.printSchema
Approach ①: join first, then deduplicate and aggregate
val df2 = df.alias("uf").join(dfUsers.alias("u"),$"uf.user_id"===$"u.user_id","inner").select($"uf.user_id",$"uf.friend_id").distinct
.groupBy($"user_id").agg(count($"friend_id").as("cnt")).orderBy($"cnt".desc)
df2.show
Approach ②: deduplicate first, then join and aggregate
df.distinct.alias("uf").join(dfUsers.alias("u"),$"uf.user_id"===$"u.user_id","inner").select($"uf.user_id",$"uf.friend_id")
.groupBy($"user_id").agg(count($"friend_id").as("cnt")).orderBy($"cnt".desc).show
Approach ③: deduplicate and aggregate first, then join the much smaller result
df.distinct.groupBy($"user_id").agg(count($"friend_id").as("cnt"))
.join(dfUsers,"user_id").select("user_id","cnt")
.orderBy($"cnt".desc).show
Note: approach ③ is recommended here; the core idea of a join query is to reduce the amount of data before the join.
hdfs dfs -cat /eventData/event_attendees.csv | head -n 2
val dfEvent_attendees = spark.read.format("csv").option("header","true").load("hdfs:///eventData/event_attendees.csv")
dfEvent_attendees.show(2)
dfEvent_attendees.printSchema
Requirement 1: the columns are event, yes, maybe, invited, no; reshape the table into event_id, user_id, attend_type
// event,yes
val dfYes = dfEvent_attendees.select("event","yes").withColumnRenamed("event","event_id").withColumn("user_id",explode(split($"yes"," "))).withColumn("attend_type",lit("yes")).drop("yes")
dfYes.show
//event,maybe
val dfMaybe = dfEvent_attendees.select("event","maybe").withColumnRenamed("event","event_id").withColumn("user_id",explode(split($"maybe"," "))).withColumn("attend_type",lit("maybe")).drop("maybe")
//event,invited
val dfInvited= dfEvent_attendees.select("event","invited").withColumnRenamed("event","event_id").withColumn("user_id",explode(split($"invited"," "))).withColumn("attend_type",lit("invited")).drop("invited")
//event,no
val dfNo = dfEvent_attendees.select("event","no").withColumnRenamed("event","event_id").withColumn("user_id",explode(split($"no"," "))).withColumn("attend_type",lit("no")).drop("no")
val dfResult = dfYes.union(dfMaybe).union(dfInvited).union(dfNo)
dfResult.count
dfResult.distinct.count
The four blocks above can also be generated in one expression by mapping over the column names and unioning the results:
val dfFinal = Seq("yes","maybe","invited","no").map(at => dfEvent_attendees.select($"event".as("event_id"),col(at)).withColumn("user_id",explode(split(col(at)," "))).drop(col(at)).withColumn("attend_type",lit(at))).reduce((x,y) => x.union(y))
dfFinal.show(10)
dfFinal.distinct.count
dfEvent_attendees.createOrReplaceTempView("event_attendees")
%sql
with final as
(
select distinct
event as event_id,
user_id,
'yes' as attend_type
from event_attendees
lateral view explode(split(yes," ")) t as user_id
union all
select distinct
event as event_id,
user_id,
'maybe' as attend_type
from event_attendees
lateral view explode(split(maybe," ")) t as user_id
union all
select distinct
event as event_id,
user_id,
'invited' as attend_type
from event_attendees
lateral view explode(split(invited," ")) t as user_id
union all
select distinct
event as event_id,
user_id,
'no' as attend_type
from event_attendees
lateral view explode(split(no," ")) t as user_id
) select * from final
hdfs dfs -cat /eventData/train.csv | head -n 3
val dfTrain = spark.read.format("csv").option("header","true").load("hdfs:///eventData/train.csv")
dfTrain.printSchema
dfTrain.show
Requirement 1: check whether any (user, event) pair appears more than once
dfTrain.count
dfTrain.select("user","event").distinct.count
Requirement 2: if duplicates exist, find them and analyze them
dfTrain.groupBy($"user",$"event").agg(count($"user").as("cnt")).filter($"cnt" > lit(1)).show
dfTrain.filter($"user" === lit("661151794") && $"event" === lit("187728438")).show(false)
dfTrain.filter($"user" === lit("661151794") && $"event" === lit("187728438")).orderBy($"timestamp").show(false)
dfTrain.dropDuplicates("user","event").filter($"user" === lit("661151794") && $"event" === lit("187728438")).orderBy($"timestamp".asc).show(false)
dfTrain.orderBy($"timestamp".desc).dropDuplicates("user","event").filter($"user" === lit("661151794") && $"event" === lit("187728438")).show(false)
Dedup approach ①: the dropDuplicates() operator deduplicates on the specified columns; however many duplicate rows there are, it keeps the first one by default
Dedup approach ②: distinct deduplicates by comparing the full content of every row
Dedup approach ③: use a window function to number the rows within each group and keep a specific row number
Import the window function support:
import org.apache.spark.sql.expressions.Window
Query with the window function, keeping only the rows where rn = 1:
val df5 = dfTrain.withColumn("rn",row_number() over Window.partitionBy($"user",$"event").orderBy($"timestamp".desc))//.filter($"rn" === lit(1)).drop("rn")
df5.filter($"user" === lit("661151794") && $"event" === lit("187728438")).show(false)
df5.filter(($"user" === lit("661151794") && $"event" === lit("187728438")) || ($"user" === lit("1895679477") && $"event" === lit("2019748690"))).show(false)
Count the rows whose timestamp matches the expected yyyy-MM-dd HH:mm:ss format to check for invalid values:
dfTrain.filter($"timestamp".rlike("^\\d{4}-\\d{2}-\\d{2}\\s\\d{2}:\\d{2}:\\d{2}.*")).count
Note: