Spark hands-on notes: case 3

// Read the 30-day event logs from the Hive warehouse; fields are ';'-separated
// ("\073" is the octal escape for ';').
// Key: line(5) for event codes 598017/614420/614421, otherwise line(4);
// value: (event code line(0), timestamp line(2) truncated to "yyyy-MM-dd HH:mm:ss").
val mapper = sc.textFile("/user/hive/warehouse/sla.db/e*_30days/")
  .map(line => line.split("\073"))
  .map(line => (
    if (line(0) == "598017" || line(0) == "614420" || line(0) == "614421") line(5) else line(4),
    (line(0), line(2).substring(0, 19))
  ))

// Flags keys that break an SLA window. eventMap keeps, per event code, the
// latest timestamp: sortBy puts the events in time order and toMap lets later
// entries overwrite earlier ones.
def filterSLA(events: Seq[(String, String)]): Boolean = {
  val eventMap = events.sortBy(_._2).toMap
  if (eventMap contains "614401") {
    val format = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val ts_614401 = format.parse(eventMap("614401")).getTime
    // 401411 more than 10 minutes after 614401
    if (eventMap contains "401411") {
      val ts_401411 = format.parse(eventMap("401411")).getTime
      if (ts_401411 - ts_614401 > 10*60*1000) return true
    }
    // 393217 more than 1 hour after 614401
    if (eventMap contains "393217") {
      val ts_393217 = format.parse(eventMap("393217")).getTime
      if (ts_393217 - ts_614401 > 60*60*1000) return true
    }
    // 614405 more than 48 hours after 614401
    if (eventMap contains "614405") {
      val ts_614405 = format.parse(eventMap("614405")).getTime
      if (ts_614405 - ts_614401 > 48*60*60*1000) return true
    }
    return false
  } else {
    return false
  }
}

// Emits the latest timestamp of each tracked event code, tab-separated,
// with "NULL" for missing events.
def flatValues(events: Seq[(String, String)]): String = {
  val eventMap = events.sortBy(_._2).toMap
  Seq("614401", "401411", "393217", "614405")
    .map(code => eventMap.getOrElse(code, "NULL"))
    .mkString("\t")
}
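
A quick sanity check of the two helpers in the shell (event codes taken from the rules above; the timestamps are made up for illustration):

val sample = Seq(
  ("614401", "2015-06-01 10:00:00"),
  ("401411", "2015-06-01 10:15:00"),  // 15 minutes after 614401, outside the 10-minute window
  ("393217", "2015-06-01 10:30:00")
)
filterSLA(sample)   // true: 401411 arrives more than 10 minutes after 614401
flatValues(sample)  // "2015-06-01 10:00:00\t2015-06-01 10:15:00\t2015-06-01 10:30:00\tNULL"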

// Group events per key, keep the keys that break an SLA window, and write one line per key.
mapper.groupByKey
  .filter(kv => filterSLA(kv._2.toSeq))
  .map(kv => kv._1 + "\t" + flatValues(kv._2.toSeq))
  .saveAsTextFile("/tmp/opt")
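
Both helpers only ever look at the latest timestamp per event code, so the grouping could in principle avoid materializing whole iterables; a minimal sketch with aggregateByKey (untested, keeps the max timestamp per event code as the aggregation state; filterSLA/flatValues would then need a Map-taking variant):

val latestPerKey = mapper.aggregateByKey(Map.empty[String, String])(
  // keep the later timestamp per event code; "yyyy-MM-dd HH:mm:ss" sorts correctly as a plain string
  (m, ev) => if (m.get(ev._1).forall(_ <= ev._2)) m + ev else m,
  (m1, m2) => (m1.toSeq ++ m2.toSeq)
    .groupBy(_._1)
    .map { case (code, tss) => code -> tss.map(_._2).max }
)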
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat

import org.apache.spark.rdd.NewHadoopRDD

// Read HBase table "tmp" through TableInputFormat; each record is
// (row key as ImmutableBytesWritable, full row as Result).
val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, "tmp")

var hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])

hBaseRDD.count()

import scala.collection.JavaConverters._

// For each row, pick the newest cell in cf:val and return its raw (row key, value) bytes.
hBaseRDD.map(tuple => tuple._2).map(result => result.getColumn("cf".getBytes(), "val".getBytes())).map(keyValues => {
  val latest = keyValues.asScala.reduceLeft {
    (a, b) => if (a.getTimestamp > b.getTimestamp) a else b
  }
  (latest.getRow, latest.getValue)
}).take(10)

// Same as above, but decode the row key and the newest cf:val value to strings.
hBaseRDD.map(tuple => tuple._2).map(result => (result.getRow, result.getColumn("cf".getBytes(), "val".getBytes()))).map(row => {
  (
    row._1.map(_.toChar).mkString,
    row._2.asScala.reduceLeft {
      (a, b) => if (a.getTimestamp > b.getTimestamp) a else b
    }.getValue.map(_.toChar).mkString
  )
}).take(10)
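
The timestamp reduceLeft keeps recurring in these snippets; it could be factored into a small helper (a sketch, assuming the pre-1.0 HBase client where Result.getColumn returns java.util.List[KeyValue]):

import org.apache.hadoop.hbase.KeyValue

// pick the cell with the newest timestamp out of one column's versions
def latestKv(cells: java.util.List[KeyValue]): KeyValue =
  cells.asScala.reduceLeft((a, b) => if (a.getTimestamp > b.getTimestamp) a else b)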


conf.set(TableInputFormat.INPUT_TABLE, "test1")

//var hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])

hBaseRDD.map(tuple => tuple._2).map(result => (result.getRow, result.getColumn("lf".getBytes(), "app1".getBytes()))).map(row => if (row._2.size > 0) {
(
  row._1.map(_.toChar).mkString,
  row._2.asScala.reduceLeft {
    (a, b) => if (a.getTimestamp > b.getTimestamp) a else b
  }.getValue.map(_.toInt).mkString
)
}).take(10)

// Same lf:app1 read, but decode the newest value as an 8-byte big-endian Long.
import java.nio.ByteBuffer
hBaseRDD.map(tuple => tuple._2).map(result => (result.getRow, result.getColumn("lf".getBytes(), "app1".getBytes()))).map(row => if (row._2.size > 0) {
  (
    row._1.map(_.toChar).mkString,
    ByteBuffer.wrap(row._2.asScala.reduceLeft {
      (a, b) => if (a.getTimestamp > b.getTimestamp) a else b
    }.getValue).getLong
  )
}).take(10)
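
An alternative sketch of the same read that drops the column-less rows instead of emitting (), and decodes with the HBase Bytes utility (assuming the values were written with Bytes.toBytes(long)):

import org.apache.hadoop.hbase.util.Bytes

hBaseRDD.map(_._2)
  .map(result => (result.getRow, result.getColumn("lf".getBytes(), "app1".getBytes())))
  .collect { case (rowKey, cells) if cells.size > 0 =>
    val latest = cells.asScala.maxBy(_.getTimestamp)
    (Bytes.toString(rowKey), Bytes.toLong(latest.getValue))
  }
  .take(10)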


//conf.set(TableInputFormat.SCAN_COLUMN_FAMILY, "lf")
// Restrict the scan to lf:app1 so result.value returns that column's newest value directly.
conf.set(TableInputFormat.SCAN_COLUMNS, "lf:app1")

//var hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])

import java.nio.ByteBuffer
hBaseRDD.map(tuple => tuple._2).map(result => {
  ( result.getRow.map(_.toChar).mkString,
    ByteBuffer.wrap(result.value).getLong
  )
}).take(10)


// Fresh conf/RDD for table "test1"; split the row key on '|' and group by its prefix.
val conf = HBaseConfiguration.create()
conf.set(TableInputFormat.INPUT_TABLE, "test1")

var hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat], classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], classOf[org.apache.hadoop.hbase.client.Result])

var rows = hBaseRDD.map(tuple => tuple._2).map(result => result.getRow.map(_.toChar).mkString)
rows.map(row => row.split("\\|")).map(r => if (r.length > 1) (r(0), r(1)) else (r(0), "")).groupByKey.take(10)
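
To count row keys per prefix instead of only listing the first groups, the same split can feed a reduceByKey (sketch):

rows.map(_.split("\\|")).map(r => (r(0), 1)).reduceByKey(_ + _).take(10)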
