// Spark practice notes, case 4 (blog title; kept as a comment so the file stays compilable)

package de.zalando.sla_spark_sql

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SLA_parquetSQL {

  /**
   * Spark job: registers three suffixed Parquet event tables, unions their
   * (ordernumber -> (type_id, truncated event_time)) pairs, keeps only orders
   * that violate an SLA (see [[FilterSLA.filterSLA]]), and writes one
   * tab-separated line per violating order to args(1).
   *
   * args(0) = table-name suffix of the Parquet directories
   * args(1) = output path for the filtered result
   */
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("SLA Filter"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    import sqlContext._
    val suffix = args(0)

    val tables = Seq(
      "e60001_shipment_exported",
      "e62005_shipment_shipped_and_closed",
      "e62006_shipment_canceled_and_closed")

    // Register each suffixed Parquet directory under its bare table name.
    tables.foreach { table =>
      sqlContext
        .parquetFile("/user/hive/warehouse/sla_parquet.db/" + table + "_" + suffix)
        .registerAsTable(table)
    }

    // key = ordernumber, value = (event type id, timestamp truncated to seconds
    // — substring(0, 19) keeps "yyyy-MM-dd HH:mm:ss").
    def keyedEvents(table: String) =
      sql("select ordernumber, type_id, event_time from " + table)
        .map(line => (line(0), (line(1).toString, line(2).toString.substring(0, 19))))

    val un = tables.map(keyedEvents).reduce(_ union _)

    un.groupByKey
      .filter(kv => FilterSLA.filterSLA(kv._2.toSeq))
      .map(kv => kv._1 + "\t" + Utils.flatValues(kv._2.toSeq))
      .saveAsTextFile(args(1))
  }
}
package de.zalando.sla_spark_sql

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SLA {

  /**
   * Spark job: reads three suffixed Parquet event tables directly (no SQL),
   * keys every event by its order number, unions them, and writes the
   * SLA-violating orders (see [[FilterSLA.filterSLA]]) to args(1).
   *
   * args(0) = table-name suffix of the Parquet directories
   * args(1) = output path for the filtered result
   */
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("SLA Filter Parquet"))
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    val suffix = args(0)

    // For event type ids 598017/614420/614421 the order number lives in
    // column 5, for every other type in column 4. Column 2 is the event time,
    // truncated to "yyyy-MM-dd HH:mm:ss" via substring(0, 19).
    def keyedEvents(table: String) =
      sqlContext
        .parquetFile("/user/hive/warehouse/sla_parquet.db/" + table + "_" + suffix)
        .map { line =>
          val typeId = line(0).toString
          val orderNumber =
            if (typeId == "598017" || typeId == "614420" || typeId == "614421") line(5).toString
            else line(4).toString
          (orderNumber, (typeId, line(2).toString.substring(0, 19)))
        }

    val un = Seq(
      "e60001_shipment_exported",
      "e62005_shipment_shipped_and_closed",
      "e62006_shipment_canceled_and_closed")
      .map(keyedEvents)
      .reduce(_ union _)

    un.groupByKey
      .filter(kv => FilterSLA.filterSLA(kv._2.toSeq))
      .map(kv => kv._1 + "\t" + Utils.flatValues(kv._2.toSeq))
      .saveAsTextFile(args(1))
  }
}
package de.zalando.sla_spark_sql

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SLA_csvFile {

  /**
   * Spark job: same SLA filter as the Parquet variants but reading a
   * semicolon-separated text file.
   *
   * args(0) = input path of the ';'-separated event file
   * args(1) = output path for the filtered result
   */
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("SLA Filter CSV"))

    // ";" replaces the original "\073": octal string escapes (deprecated in
    // Scala 2.10, removed later) denoted the very same semicolon character.
    // For event type ids 598017/614420/614421 the order number is field 5,
    // otherwise field 4; field 2 is the timestamp, truncated to seconds.
    val keyedEvents = sc.textFile(args(0))
      .map(_.split(";"))
      .map { fields =>
        val typeId = fields(0)
        val orderNumber =
          if (typeId == "598017" || typeId == "614420" || typeId == "614421") fields(5)
          else fields(4)
        (orderNumber, (typeId, fields(2).substring(0, 19)))
      }

    keyedEvents.groupByKey
      .filter(kv => FilterSLA.filterSLA(kv._2.toSeq))
      .map(kv => kv._1 + "\t" + Utils.flatValues(kv._2.toSeq))
      .saveAsTextFile(args(1))
  }
}
package de.zalando.sla_spark_sql

object Utils {

  /** Output column order: exported, shipped, canceled-closed, 48h-close events. */
  private val OutputEventIds = Seq("614401", "401411", "393217", "614405")

  /**
   * Flattens a sequence of (event type id, timestamp) pairs into a
   * tab-separated line with one column per id in [[OutputEventIds]];
   * missing events are rendered as "NULL".
   *
   * Sorting by timestamp before `toMap` means the LATEST timestamp wins
   * when an event id occurs more than once (later entries overwrite
   * earlier ones during map construction).
   */
  def flatValues(events: Seq[(String, String)]): String = {
    val latestByEvent = events.sortBy(_._2).toMap
    OutputEventIds.map(id => latestByEvent.getOrElse(id, "NULL")).mkString("\t")
  }
}
package de.zalando.sla_spark_sql

object FilterSLA {

  /**
   * Maximum allowed delay (milliseconds) after the start event "614401",
   * keyed by the follow-up event id.
   */
  private val SlaThresholdsMillis = Seq(
    "401411" -> 10L * 60 * 1000,      // 10 minutes
    "393217" -> 60L * 60 * 1000,      // 1 hour
    "614405" -> 48L * 60 * 60 * 1000  // 48 hours
  )

  /**
   * Returns true iff the order violates an SLA: the start event "614401"
   * is present and at least one follow-up event arrives later than its
   * threshold allows. Orders without the start event never match.
   *
   * Timestamps are "yyyy-MM-dd HH:mm:ss" strings; sorting by timestamp
   * before `toMap` keeps the latest occurrence per event id (same
   * semantics as [[Utils.flatValues]]).
   */
  def filterSLA(events: Seq[(String, String)]): Boolean = {
    val eventMap = events.sortBy(_._2).toMap
    eventMap.get("614401") match {
      case None => false
      case Some(startTimestamp) =>
        // SimpleDateFormat is not thread-safe, so keep it local to the call.
        val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        val start = format.parse(startTimestamp).getTime
        SlaThresholdsMillis.exists { case (eventId, maxDelay) =>
          eventMap.get(eventId).exists(ts => format.parse(ts).getTime - start > maxDelay)
        }
    }
  }
}

// Related topic: spark (blog footer; kept as a comment so the file stays compilable)