Spark 实战笔记 case2

// Read the three tab-separated event logs; each record becomes the array of its columns.
val t1 = sc.textFile("/tmp/db_case1/order_created/*").map(_.split("\t"))
val t2 = sc.textFile("/tmp/db_case1/order_picked/*").map(_.split("\t"))
val t3 = sc.textFile("/tmp/db_case1/order_shipped/*").map(_.split("\t"))

// Key every record by column 0 and pair it with (event tag, first 19 characters of column 1).
// NOTE(review): column 0 is presumably the order id and column 1 a "yyyy-MM-dd HH:mm:ss"
// timestamp (19 characters) -- confirm against the input files.
val t1kv = t1.map { cols => ( cols(0), ( "created", cols(1).substring(0, 19) ) ) }
val t2kv = t2.map { cols => ( cols(0), ( "picked" , cols(1).substring(0, 19) ) ) }
val t3kv = t3.map { cols => ( cols(0), ( "shipped", cols(1).substring(0, 19) ) ) }

/**
 * Flattens one record produced by two chained left outer joins into a flat 4-tuple.
 *
 * @param events (key, ((first event, optional second event), optional third event)), where
 *               every event is a pair whose second component is kept
 * @return (key, second component of the first event, of the second event, of the third event);
 *         an absent optional event contributes the empty string
 */
def flatValues ( events:((String, (((String, String), Option[(String, String)]), Option[(String, String)]))) ): (String, String, String, String) = {
  val (key, ((first, second), third)) = events
  ( key, first._2, second.map(_._2).getOrElse(""), third.map(_._2).getOrElse("") )
}

// Left-join created events with picked and then shipped events, flatten every joined
// record with flatValues and print the first 10 results.
t1kv.leftOuterJoin(t2kv).leftOuterJoin(t3kv).
  map(flatValues).
  take(10).
  foreach(println)


/**
 * Flattens one record produced by two chained left outer joins, keeping each event as a pair.
 *
 * @param events (key, ((first event, optional second event), optional third event))
 * @return (key, first event, second event, third event); an absent optional event is
 *         replaced by the placeholder pair ("", "")
 */
def flatValues ( events:((String, (((String, String), Option[(String, String)]), Option[(String, String)]))) ): (String, (String, String), (String, String), (String, String)) = {
  val placeholder = ("", "")
  val (key, ((first, second), third)) = events
  ( key, first, second.getOrElse(placeholder), third.getOrElse(placeholder) )
}

// Same double left outer join as before, now flattened with the flatValues variant that
// keeps (tag, value) pairs; prints the first 10 results.
t1kv.leftOuterJoin(t2kv).leftOuterJoin(t3kv).
  map(flatValues).
  take(10).
  foreach(println)


/**
 * Flattens one record of a single left outer join into a key plus a two-element sequence.
 *
 * @param events (key, (first event, optional second event))
 * @return (key, Seq(first event, second event)); an absent second event is replaced by
 *         the placeholder pair ("", ""), so the sequence always has two elements
 */
def flatValues ( events:(String, ((String, String), Option[(String, String)])) ): (String, Seq[(String,String)]) = {
  val (key, (first, second)) = events
  ( key, Seq(first, second.getOrElse(("", ""))) )
}

// Left-join created events with picked events only, flatten every record to
// (key, Seq(events)) with flatValues and print the first 10 results.
t1kv.leftOuterJoin(t2kv).
  map(flatValues).
  take(10).
  foreach(println)

/**
 * Appends the optional right-hand side of a left outer join to an already collected event list.
 *
 * @param events (key, (events collected so far, optional next event))
 * @return (key, collected events with the next event appended); when the next event is
 *         absent the collected sequence is returned unchanged
 */
def flatList ( events:(String, (Seq[(String, String)], Option[(String, String)])) ): (String, Seq[(String,String)]) = {
  val (key, (collected, next)) = events
  next match {
    case Some(event) => ( key, collected :+ event )
    case None        => ( key, collected )
  }
}

// Join created with picked events and flatten to (key, Seq(events)), then left-join the
// shipped events and append them with flatList; prints the first 10 results.
t1kv.leftOuterJoin(t2kv).map(flatValues).
  leftOuterJoin(t3kv).map(flatList).
  take(10).
  foreach(println)


/**
 * Predicate that selects event lists which do not have a "picked" event within two hours
 * of their "created" event.
 *
 * @param events (event tag, timestamp) pairs; timestamps are parsed as "yyyy-MM-dd HH:mm:ss"
 * @return false only when both "created" and "picked" are present and less than
 *         7,200,000 ms (2 hours) apart; true when the gap is 2 hours or more, or when
 *         either of the two events is missing
 */
def filterSLA ( events:Seq[(String,String)] ): Boolean = {
  val timeByTag = events.toMap
  (timeByTag.get("created"), timeByTag.get("picked")) match {
    case (Some(createdAt), Some(pickedAt)) =>
      val parser = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val createdMillis = parser.parse(createdAt).getTime
      val pickedMillis = parser.parse(pickedAt).getTime
      // Keep the record when picking took at least two hours.
      pickedMillis - createdMillis >= 7200000
    case _ =>
      // Without both timestamps no gap can be computed; such records are kept.
      true
  }
}

// Build (key, Seq(events)) per created record via flatValues + flatList, keep only the
// records accepted by filterSLA, then collect everything to the driver and print it.
t1kv.leftOuterJoin(t2kv).map(flatValues).
  leftOuterJoin(t3kv).map(flatList).
  filter { case (_, events) => filterSLA(events) }.
  collect.
  foreach(println)

// Variant that wraps the created event in a one-element Seq first, so flatList can append
// both the picked and the shipped event; records accepted by filterSLA are collected and printed.
t1kv.map { case (key, createdEvent) => (key, Seq(createdEvent)) }.
  leftOuterJoin(t2kv).map(flatList).
  leftOuterJoin(t3kv).map(flatList).
  filter { case (_, events) => filterSLA(events) }.
  collect.
  foreach(println)

// Collect, per key, the created event followed by at most one picked and one shipped event,
// then print every record.
// Picked/shipped records are grouped by key first and reduced to the single pair with the
// smallest second component, so a key with several such records joins only once.
// minBy is used instead of sort + toMap: a Map keeps just the last pair for a repeated
// tag, which would surface the largest value rather than the smallest.
// NOTE(review): "smallest" is a plain string comparison; it equals "earliest" only if the
// values are fixed-width "yyyy-MM-dd HH:mm:ss" timestamps -- confirm.
t1kv.map(kv => (kv._1, Seq(kv._2))).
leftOuterJoin(
  t2kv.groupByKey.map(kv => (kv._1, kv._2.minBy(_._2)))
).map(flatList).
leftOuterJoin(
  t3kv.groupByKey.map(kv => (kv._1, kv._2.minBy(_._2)))
).map(flatList).
collect.foreach(println)

/**
 * Reduces all events grouped under one key to the single event with the smallest second
 * component.
 *
 * The selection uses minBy directly on the group. Sorting and then converting to a Map
 * does not work here: a Map keeps only the last pair for a repeated tag (yielding the
 * largest value instead of the smallest) and does not preserve the sorted order.
 *
 * NOTE(review): the comparison is a plain string comparison; it matches chronological
 * order only if the values are fixed-width "yyyy-MM-dd HH:mm:ss" timestamps -- confirm.
 *
 * @param groups (key, all (tag, value) pairs grouped under that key); the group must be
 *               non-empty, otherwise minBy throws
 * @return (key, the pair with the smallest value; the first such pair on ties)
 */
def flatGroup ( groups:(String, (Iterable[(String, String)])) ): (String, (String,String)) = {
  val (key, events) = groups
  ( key, events.minBy(_._2) )
}

// Wrap each created event in a Seq, then append at most one picked and one shipped event:
// both are grouped by key and reduced to a single pair by flatGroup before the left outer
// join, and flatList appends the joined pair. All records are collected and printed.
t1kv.map { case (key, createdEvent) => (key, Seq(createdEvent)) }.
  leftOuterJoin(t2kv.groupByKey.map(flatGroup)).map(flatList).
  leftOuterJoin(t3kv.groupByKey.map(flatGroup)).map(flatList).
  collect.
  foreach(println)


// Same grouped pipeline as above, with one extra step: only the records accepted by
// filterSLA are collected and printed.
t1kv.map { case (key, createdEvent) => (key, Seq(createdEvent)) }.
  leftOuterJoin(t2kv.groupByKey.map(flatGroup)).map(flatList).
  leftOuterJoin(t3kv.groupByKey.map(flatGroup)).map(flatList).
  filter { case (_, events) => filterSLA(events) }.
  collect.
  foreach(println)

你可能感兴趣的:(spark)