导读
本项目是 SparkSQL 阶段的练习项目, 主要目的是夯实同学们对于 SparkSQL 的理解和使用
数据集
2013年纽约市出租车乘车记录
需求
统计出租车利用率: 即出租车到达某个目的地(下车点)后, 等待下一个客人上车的平均间隔时间
现在有2013年纽约市出租车乘车记录表,表中字段如下:
字段 示例 示意
hack_license
BA96DE419E711691B9445D6A6307C170
执照号, 可以唯一标识一辆出租车
pickup_datetime
2013-01-01 15:11:48
上车时间
dropoff_datetime
2013-01-01 15:18:10
下车时间
pickup_longitude
-73.978165
上车点
pickup_latitude
40.757977
上车点
dropoff_longitude
-73.989838
下车点
dropoff_latitude
40.751171
下车点
其中有三个点需要注意
hack_license
是出租车执照, 可以唯一标识一辆出租车
pickup_datetime
和dropoff_datetime
分别是上车时间和下车时间, 通过这个时间, 可以获知行车时间
pickup_longitude
和dropoff_longitude
是经度, 经度所代表的是横轴, 也就是 X 轴
pickup_latitude
和dropoff_latitude
是纬度, 纬度所代表的是纵轴, 也就是 Y 轴
我们需要统计出租车在每一个地区的平均等待时间。
涉及相关知识点:
通过本项目希望大家能够掌握:
工程搭建 -> 数据读取 -> 数据清理 -> 行政区域处理 -> 会话统计
TaxiAnalysisRunner.scala
package cn.happyvicky.taxi
import java.text.SimpleDateFormat
import java.util.Locale
import java.util.concurrent.TimeUnit
import com.esri.core.geometry.{GeometryEngine, Point, SpatialReference}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import scala.io.Source
object TaxiAnalysisRunner {

  /**
   * Entry point: loads the 2013 NYC taxi trip records, cleans them, maps each
   * drop-off point to its borough, and reports the average idle time
   * (drop-off -> next pick-up of the same taxi) per borough.
   */
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark = SparkSession.builder()
      .master("local[6]")
      .appName("taxi")
      .getOrCreate()

    // 2. Bring SQL functions and implicit encoders into scope
    import spark.implicits._
    import org.apache.spark.sql.functions._

    // 3. Read the raw CSV file (first line is the header)
    val taxiRaw: DataFrame = spark.read
      .option("header", value = true)
      .csv("sparktaix/dataset/half_trip.csv")
    // taxiRaw.show()
    // taxiRaw.printSchema()

    // 4. Parse every Row into a Trip; rows that fail to parse are captured as
    //    Right((row, exception)) instead of crashing the job
    val taxiParsed: RDD[Either[Trip, (Row, Exception)]] = taxiRaw.rdd.map(safe(parse))
    // Malformed rows could be inspected with:
    //   taxiParsed.filter(_.isRight).map(_.right.get._1)

    // BUGFIX: keep only the successfully parsed rows before extracting.
    // Calling .left.get on a Right (a parse failure) throws
    // NoSuchElementException and would abort the whole job.
    val taxiGood: Dataset[Trip] = taxiParsed
      .filter(_.isLeft)
      .map(either => either.left.get)
      .toDS()

    // 5. Trip-duration histogram
    // 5.1 UDF: convert the (dropOff - pickUp) epoch-millis delta to whole hours
    val hours = (pickUpTime: Long, dropOffTime: Long) => {
      val duration = dropOffTime - pickUpTime
      TimeUnit.HOURS.convert(duration, TimeUnit.MILLISECONDS)
    }
    val hoursUDF = udf(hours)

    // 5.2 Count trips per duration bucket
    taxiGood.groupBy(hoursUDF($"pickUpTime", $"dropOffTime") as "duration")
      .count()
      .sort("duration")
      .show()

    // 6. Based on the histogram, drop abnormal records (keep 0-3 hour trips)
    spark.udf.register("hours", hours)
    val taxiClean = taxiGood.where("hours(pickUpTime,dropOffTime) BETWEEN 0 AND 3")
    // taxiClean.show()

    // 7. Attach borough information
    // 7.1 Load the borough-boundary GeoJSON dataset
    val geoJson = Source.fromFile("sparktaix/dataset/nyc-borough-boundaries-polygon.geojson").mkString
    val featureCollection = FeatureExtraction.parseJson(geoJson)

    // 7.2 Sort features so that, per borough code, larger polygons come first:
    //     bigger areas are hit more often, so linear scans terminate sooner
    val sortedFeatures = featureCollection.features.sortBy { feature =>
      (feature.properties("boroughCode"), -feature.getGeometry().calculateArea2D())
    }

    // 7.3 Broadcast the (small) sorted feature list to every executor
    val featuresBC = spark.sparkContext.broadcast(sortedFeatures)

    // 7.4 Point-in-polygon lookup: (x = longitude, y = latitude) -> borough name
    val boroughLookUp = (x: Double, y: Double) => {
      // 7.4.1 Find the first feature whose polygon contains the point
      //       (WGS84, spatial reference 4326)
      val featureHit: Option[Feature] = featuresBC.value.find { feature =>
        GeometryEngine.contains(feature.getGeometry(), new Point(x, y), SpatialReference.create(4326))
      }
      // 7.4.2 Extract its borough name; "NA" when no polygon matches
      featureHit.map(feature => feature.properties("borough")).getOrElse("NA")
    }

    // 7.5 (optional) borough-level trip counts
    // val boroughUDF = udf(boroughLookUp)
    // taxiClean.groupBy(boroughUDF('dropOffX,'dropOffY))
    //   .count()
    //   .show()

    // 8.1 Drop records missing either coordinate, then
    // 8.2 group each taxi's trips into the same partition and order them by
    //     pickup time, so consecutive rows form that taxi's session timeline
    val session = taxiClean.where("dropOffX != 0 and dropOffY != 0 and pickUpX != 0 and pickUpY != 0")
      .repartition('license)
      .sortWithinPartitions('license, 'pickUpTime)

    // 8.3 Idle time between two consecutive trips of the same taxi.
    // BUGFIX: the wait for the next passenger starts when trip t1 ENDS
    // (dropOffTime), not when it starts — using pickUpTime would count the
    // trip itself as waiting time.
    def boroughDuration(t1: Trip, t2: Trip): (String, Long) = {
      val borough = boroughLookUp(t1.dropOffX, t1.dropOffY)
      val duration = (t2.pickUpTime - t1.dropOffTime) / 1000 // seconds
      (borough, duration)
    }

    val boroughDurations = session.mapPartitions { trips =>
      // Pair adjacent trips with a sliding window; only pairs belonging to
      // the same taxi (license) are meaningful
      trips.sliding(2)
        .filter(_.size == 2)
        .filter(p => p.head.license == p.last.license)
        .map(p => boroughDuration(p.head, p.last))
    }.toDF("borough", "seconds")

    boroughDurations.where("seconds > 0")
      .groupBy("borough")
      .agg(avg('seconds), stddev('seconds))
      .show()
  }

  /**
   * Wraps a parsing function so exceptions are captured instead of propagated:
   * Left(result) on success, Right((input, exception)) on failure.
   * The returned function mixes in Serializable so Spark can ship it to
   * executors.
   */
  def safe[P, R](f: P => R): P => Either[R, (P, Exception)] = {
    new Function[P, Either[R, (P, Exception)]] with Serializable {
      override def apply(param: P): Either[R, (P, Exception)] = {
        try {
          Left(f(param))
        } catch {
          case e: Exception => Right((param, e))
        }
      }
    }
  }

  /**
   * Converts a raw CSV Row into a typed Trip.
   * Missing values fall back to null / 0 and are filtered out downstream.
   */
  def parse(row: Row): Trip = {
    val richRow = new RichRow(row)
    val license = richRow.getAs[String]("hack_license").orNull
    val pickUpTime = parseTime(richRow, "pickup_datetime")
    val dropOffTime = parseTime(richRow, "dropoff_datetime")
    val pickUpX = parseLocation(richRow, "pickup_longitude")
    val pickUpY = parseLocation(richRow, "pickup_latitude")
    val dropOffX = parseLocation(richRow, "dropoff_longitude")
    val dropOffY = parseLocation(richRow, "dropoff_latitude")
    Trip(license, pickUpTime, dropOffTime, pickUpX, pickUpY, dropOffX, dropOffY)
  }

  /**
   * Parses a "yyyy-MM-dd HH:mm:ss" field into epoch milliseconds.
   * Returns 0L when the field is null; a throwing parse is caught by `safe`.
   */
  def parseTime(row: RichRow, field: String): Long = {
    val pattern = "yyyy-MM-dd HH:mm:ss"
    val formatter = new SimpleDateFormat(pattern, Locale.ENGLISH)
    row.getAs[String](field)
      .map(time => formatter.parse(time).getTime)
      .getOrElse(0L)
  }

  /**
   * Parses a longitude/latitude field into a Double.
   * Returns 0.0 when the field is null; records with 0 coordinates are
   * filtered out before the session analysis.
   */
  def parseLocation(row: RichRow, field: String): Double = {
    row.getAs[String](field)
      .map(loc => loc.toDouble)
      .getOrElse(0.0D)
  }
}
/**
* DataFrame中Row的包装类型,主要为了包装getAs方法
* @param row
*/
/**
 * Thin wrapper around Spark's Row that makes field access null-safe:
 * instead of handing back null for a missing value, callers receive an
 * Option and must handle absence explicitly.
 *
 * @param row the underlying Spark SQL Row
 */
class RichRow(row: Row) {

  /**
   * Reads `field` from the wrapped row.
   *
   * @return None when the stored value is null, Some(value) otherwise
   */
  def getAs[T](field: String): Option[T] = {
    val index = row.fieldIndex(field)
    if (row.isNullAt(index)) None
    else Some(row.getAs[T](index))
  }
}
/**
* 代表一个行程, 是集合中的一条记录
*/
/**
 * One taxi trip — a single record of the dataset.
 * Times are epoch milliseconds; X is longitude, Y is latitude.
 * null / 0 marks a missing value and is filtered out downstream.
 */
case class Trip(
license: String,// taxi license number; uniquely identifies a taxi
pickUpTime: Long,// pick-up time (epoch millis)
dropOffTime: Long,// drop-off time (epoch millis)
pickUpX: Double,// pick-up longitude
pickUpY: Double,// pick-up latitude
dropOffX: Double,// drop-off longitude
dropOffY: Double// drop-off latitude
)
Features.scala
package cn.happyvicky.taxi
import com.esri.core.geometry.{Geometry, GeometryEngine}
import org.json4s.JsonAST.JObject
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
// Root of the parsed GeoJSON document: the list of borough features.
case class FeatureCollection(features: List[Feature])
/**
 * One GeoJSON feature: its properties map (e.g. borough name and code) plus
 * the raw geometry JSON, convertible to an Esri Geometry for spatial queries.
 */
case class Feature(properties: Map[String, String], geometry: JObject) {

  /** Renders the raw geometry JSON and parses it into an Esri Geometry. */
  def getGeometry(): Geometry = {
    import org.json4s.jackson.JsonMethods._
    val geometryJson = compact(render(geometry))
    GeometryEngine
      .geoJsonToGeometry(geometryJson, 0, Geometry.Type.Unknown)
      .getGeometry
  }
}
object FeatureExtraction {

  /**
   * Parses a GeoJSON document into a FeatureCollection.
   *
   * @param json the full GeoJSON text
   * @return the deserialized feature collection
   */
  def parseJson(json: String): FeatureCollection = {
    import org.json4s.jackson.Serialization.read
    // Formats are supplied implicitly to json4s (no custom type hints needed)
    implicit val formats = Serialization.formats(NoTypeHints)
    read[FeatureCollection](json)
  }
}
相关测试代码:
EitherTest
package cn.happyvicky.taxi
object EitherTest {

  /**
   * Demonstrates Either-based error capture, mirroring the `safe` wrapper in
   * TaxiAnalysisRunner: Left carries a successful result, Right carries the
   * failing input paired with the exception.
   *
   * NOTE: 10.0 / 0 on Doubles yields Infinity rather than throwing, so this
   * demo always takes the Left branch and prints "Infinity".
   */
  def main(args: Array[String]): Unit = {
    // Stand-in for the real parse function
    def process(divisor: Double): Double = 10.0 / divisor

    // Either => Left or Right; compare with Option => Some / None
    def safe(f: Double => Double, input: Double): Either[Double, (Double, Exception)] =
      try {
        Left(f(input))
      } catch {
        case e: Exception => Right((input, e))
      }

    safe(process, 0) match {
      case Left(value)           => println(value)
      case Right((input, error)) => println((input, error))
    }
  }
}
JsonTest.scala
package cn.happyvicky.taxi
import org.json4s.jackson.Serialization
object JsonTest {

  /**
   * json4s round-trip demo: string -> object via extract and via read,
   * then object -> string via write.
   */
  def main(args: Array[String]): Unit = {
    val product =
      """
        |{"name":"Toy","price":35.35}
        |""".stripMargin

    import org.json4s._
    import org.json4s.jackson.JsonMethods._
    // println(parse(product) \ "name")
    // Formats are supplied implicitly, e.g. how date strings are parsed
    implicit val formats = Serialization.formats(NoTypeHints)

    // Way 1: parse to an AST, then extract the case class from it
    val productObj1: Product = parse(product).extract[Product]
    println(productObj1)

    // Way 2: read deserializes string -> object in a single call
    import org.json4s.jackson.Serialization.{read, write}
    val productObj2: Product = read[Product](product)
    println(productObj2)

    // Way 3: write serializes an object back into a JSON string
    val productObj3 = new Product("电视", 2999.8)
    println(write(productObj3))
  }
}
// Minimal data model used by JsonTest for JSON serialization round-trips.
case class Product(name: String,price: Double)
如此复杂的代码,目前也是第一次接触,仍需多看多理解。