Kafka to Oracle via Spark Streaming: a template

First, define a KafkaInfo object that holds the connection settings:



import java.util.Properties


object KafkaInfo {

  val brokerList = "" // Kafka broker list, e.g. "host1:9092,host2:9092"

  val topicTest = "test"
  val topic = ""

  val url = ""
  val prop = new Properties()
  prop.setProperty("user", "")
  prop.setProperty("password", "")

  // thin: connect with Oracle's "thin" JDBC driver (pure Java, Type 4);
  // the other mode, the OCI driver, goes through Oracle's native client libraries
  // URL format: jdbc:oracle:thin:@host:port/service_name, here host aaa, port 8001, service BIDEV1
  val urlBiBeta = "jdbc:oracle:thin:@aaa:8001/BIDEV1"
  val propBiBeta = new Properties()
  propBiBeta.setProperty("user", "app")
  propBiBeta.setProperty("password", "app")

}

Then define a KafkaToOracle object:

The overall logic: Kafka log data has to be loaded into an Oracle log table in near real time. Because the direct approach is used, offsets live only in the running application, so data could be lost or reread after a restart. To guard against this, an offset table is maintained to persist the offset information.

For a given topic and partition the offset keeps advancing, so we store it in the database. At startup, if the offset table already contains rows, consumption resumes from those offsets; if there is no (topic, partition, offset) record, the stream is created without initial offsets and falls back to the consumer default starting position. After each batch of Kafka messages has been ETL'd into the Oracle log table, the existing rows of the offset table are updated.
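For reference, here is a minimal sketch of the offset table the template relies on. The table name mdw and the columns topic, partition, and offset are taken from the merge statement further down; the column types and the primary key are assumptions:

import java.sql.DriverManager

// One-time setup sketch for the offset table (column types and primary key are assumptions)
object CreateOffsetTable {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection(KafkaInfo.urlBiBeta, KafkaInfo.propBiBeta)
    val ddl =
      """
        |create table mdw (
        |  topic     varchar2(200),
        |  partition number(10),
        |  offset    number(19),
        |  constraint pk_mdw primary key (topic, partition)
        |)
      """.stripMargin
    try conn.createStatement().execute(ddl) finally conn.close()
  }
}

With the offset table in place, the KafkaToOracle object is: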

import java.math.BigDecimal
import java.sql.DriverManager

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.Json
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.parsing.json.JSON

/**
  * Created by zengxiaosen on 16/9/14.
  */
object KafkaToOracle {

  private val offsetTable = "mdw"
  private val logTable = "o"

  def main(args: Array[String]): Unit ={

    val timeInterval = args(0).toLong
    val conf = new SparkConf().setAppName(this.getClass.toString)
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(timeInterval))

    val kafkaParams = Map("metadata.broker.list" -> KafkaInfo.brokerList)
    val fromOffsets = getTopicAndPartitionOffset

    var kafkaStream : InputDStream[(String, String)] = null
    if (fromOffsets.nonEmpty){
      // offsets were found in the offset table: resume from them to avoid data loss
      kafkaStream = KafkaUtils.createDirectStream[
        String,
        String,
        StringDecoder,
        StringDecoder,
        (String, String)
        ](
        ssc,
        kafkaParams,
        fromOffsets, // Map[TopicAndPartition, Long]: TopicAndPartition is (topic, partition), Long is the offset
        (mmd: MessageAndMetadata[String, String]) => (mmd.key(), mmd.message()))
    }
    else{
      kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
        ssc,
        kafkaParams,
        Set(KafkaInfo.topicTest))
    }

    //get table information
    val (schema, insertSql) = getTableInfo(logTable)

    kafkaStream.foreachRDD(rdd =>
      if (!rdd.isEmpty()) {

        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
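        // offsetRanges(i) corresponds to partition i of this batch and carries its topic,
        // partition id, and the fromOffset/untilOffset range read from Kafka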
        rdd.mapPartitionsWithIndex((i, ite) => {
          if(ite.nonEmpty) {
            val topic = offsetRanges(i).topic
            val partition = offsetRanges(i).partition
            val offset = offsetRanges(i).untilOffset
            try{
              val conn = createConnection
              conn.setAutoCommit(false)
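              // everything from here to conn.commit() runs in a single transaction:
              // the batch insert into the log table and the offset merge commit together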
              val stmt = conn.prepareStatement(insertSql)

              for((_, line) <- ite) {
                val rows = mutable.Map[String, String]()
                for(col <- schema.split(",")){
                  rows(col) = ""
                }
                val values = parseJson(line, rows)
                for (idx <- values.indices) {
                  stmt.setString(idx + 1, values(idx)) // JDBC parameters are 1-based
                }
                stmt.addBatch()
              }
              stmt.executeBatch()
              //update topicAndPartition -> offset
              val sql =
              s"""
                 |merge into $offsetTable a
                 |using (select '$topic' as topic, $partition as partition, $offset as offset
                 |from dual) b
                 |on (a.topic = b.topic AND a.partition = b.partition)
                 |when matched then
                 |update set a.offset = b.offset
                 |when not matched then
                 |insert (topic, partition, offset) values (b.topic, b.partition, b.offset)
               """.stripMargin

              conn.createStatement().execute(sql) // run the merge on a plain Statement; execute(String) must not be called on a PreparedStatement
              conn.commit()
              conn.close()
            }catch {
              case e: Exception =>
                e.printStackTrace()
            }
          }
          Iterator.empty
        }).foreach {
          (_: Nothing) => () // no-op action: its only purpose is to force the lazy mapPartitionsWithIndex to run
        }
      }
    )
    ssc.start()
    ssc.awaitTerminationOrTimeout((timeInterval * 1000 * 1.75).toLong)
  }

  /*
  Parse a JSON string into a Map object, then collect the map values.
  @param line: kafka value, which must be in JSON format
  e.g.,
  {"id":30221920, "os": "Android", "net_info": {"net_type": "WIFI"}}
  @param rows: each key is a column of the Oracle table, and each value is initialized to ""
  @throws java.lang.Exception if JSON parsing fails
  @return the map values, sorted by column name
  e.g.,
  {{{
  {"id": 30221920, "os": "Android", "net_info": {"net_type": "WIFI"}} ==> ArrayBuffer(3.022192E7, "WIFI", "Android")
  }}}
   */


  def parseJson(line: String, rows: mutable.Map[String, String]): ArrayBuffer[String] = {
    JSON.parseFull(line) match {
      case Some(map: Map[String, Any]) =>
        parseFields(rows, map)
        var timestamp: String = null
        if (rows.contains("timestamp") && rows.get("timestamp").get.length > 10) {
          timestamp = new java.math.BigDecimal(rows.get("timestamp").get).toPlainString
        }else if (rows.contains("user_timestamp") && rows.get("user_timestamp").get.length > 10)
        {
          timestamp = new java.math.BigDecimal(rows.get("user_timestamp").get).toPlainString
        }

        if (timestamp != null){
          rows("date_id") = timestamp.substring(0, 8)
          rows("hour") = timestamp.substring(8, 10)
        }

        val values = ArrayBuffer[String]()
        for (k <- rows.keys.toSeq.sorted) {
          values.append(rows.getOrElse(k, ""))
        }
        values
      case _ => throw new Exception("Parse json failed")

    }
  }

  /*
  Parse a Map object recursively.
  If the value is a Map and the key is "domain", convert the value to a JSON string.
  e.g.,
  {{{
  Map("id" -> 3.022192E7, "os" -> "Android", "domain" -> Map("name" -> "Robin")) ==>
  Map("id" -> 3.022192E7, "os" -> "Android", "domain" -> """{"name": "Robin"}""")
  }}}

  If the value is a Map and the key is not "domain", convert (preKey -> (currKey -> value)) to
  (preKey_currKey -> value) recursively.
  e.g.,
  {{{
  Map("id" -> 3.022192E7, "os" -> "Android", "net_info" -> Map("net_type" -> "WIFI")) ==>
  Map("id" -> 3.022192E7, "os" -> "Android", "net_info_net_type" -> "WIFI")
  }}}
  @param rows Parsed result
  @param obj Map object to be parsed
  @param preKey Parent key
  @throws java.lang.Exception if a value cannot be parsed
   */

  def parseFields(rows: mutable.Map[String, String],
                  obj: Map[String, Any],
                  preKey: String = null
                 ): Unit = {
    for (k <- obj.keys){
      val key = k.toLowerCase()
      val value = obj.get(k)
      value match {
        case Some(map: Map[String, Any]) =>
          if (key == "domain") {
            val domainValue = Json.encode(map)
            rows(key) = domainValue
          }else {
            val pk = if (preKey == null) key else preKey + "_" + key
            parseFields(rows, map, pk)
          }
        case Some(v: Any) =>
          val pk = if (preKey == null) key else preKey + "_" + key
          rows(pk) = v.toString
        case _ => throw new Exception("parse failed")
      }
    }

  }




  /*
  Read the column names of the given table; the caller maps them to map keys whose values are
  initialized to "". Also build the matching insert statement.
  e.g.,
  {{{
  [id, name, os, domain] ==> ("domain,id,name,os", "insert into o(domain,id,name,os) values(?,?,?,?)")
  }}}
  @param table Oracle table
  @return (schema, insertSql): the sorted, comma-separated column list and the matching insert sql
   */

  private def getTableInfo(table: String) = {
    val conn = createConnection
    val stmt = conn.createStatement()
    val query =
      s"""
         |select column_name
         |from user_tab_columns
         |where lower(table_name) = lower('$table')
         |order by column_id
       """.stripMargin
    val rs = stmt.executeQuery(query)
    val columns = ArrayBuffer[String]()
    while(rs.next()){
      columns += rs.getString(1).toLowerCase()
    }
    conn.close()

    val schema = columns.sorted.mkString(",")
    val valuesPlaceHolder = "?," * (columns.length - 1) + "?"
    val sql = s"insert into $logTable($schema) values($valuesPlaceHolder)"

    (schema, sql)
  }


  def getTopicAndPartitionOffset : Map[TopicAndPartition, Long] = {

    val conn = createConnection
    val stmt = conn.createStatement()
    val sql =
      s"""
         |select topic, partition, offset
         |from $offsetTable
         |where topic = '${KafkaInfo.topicTest}'
       """.stripMargin
    val resultSet = stmt.executeQuery(sql)

    var fromOffsets = Map[TopicAndPartition, Long]()
    while(resultSet.next()){
      fromOffsets += (TopicAndPartition(resultSet.getString(1),
        resultSet.getInt(2)) -> resultSet.getLong(3))
    }

    conn.close()
    fromOffsets

  }

  private def createConnection = {
    //JdbcUtils.createConnectionFactory(KafkaInfo.urlBiBeta, KafkaInfo.propBiBeta)
    DriverManager.getConnection(KafkaInfo.urlBiBeta, KafkaInfo.propBiBeta)


  }

}
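
As a quick sanity check of the JSON flattening, here is a minimal sketch that exercises parseJson on the sample record from the comments above. The three-column schema is an assumption chosen for illustration; in the real job the columns come from getTableInfo:

import scala.collection.mutable

object ParseJsonDemo {
  def main(args: Array[String]): Unit = {
    // assumed column set for illustration; the real job derives it from user_tab_columns
    val rows = mutable.Map("id" -> "", "os" -> "", "net_info_net_type" -> "")
    val line = """{"id": 30221920, "os": "Android", "net_info": {"net_type": "WIFI"}}"""
    // values come back sorted by column name: id, net_info_net_type, os
    val values = KafkaToOracle.parseJson(line, rows)
    println(values) // ArrayBuffer(3.022192E7, WIFI, Android)
  }
}

Because parseJson and parseFields are plain Scala functions, this check runs without a Spark cluster or a Kafka broker.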
