SparkSql 读写Hive 分区表(数据清洗类)

主要使用 SparkSql 对 Hive 分区表使用动态分区进行操作,根据规则对数据进行清洗等,除了刚导入数据时指定date范围清洗,后期按天进行清洗。

 

package com.sm.cleandata

import java.io.File
import java.util.Properties

import com.sm.conf.ConfigurationManager
import com.sm.constants.Constants
import com.sm.utils.DateUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.LoggerFactory

/**
  * 二.
  * 数据清洗类
  * 将ods层数据进行清洗,导入dwd 层,以及配置汇总表的生成
  *
  * create by LiuJinHe 2019/10/23
  */
object CleanOdsToDwd {
  private val warehouseLocation = "hdfs://cdh-slave01:9870/user/hive/warehouse"
  //  private val warehouseLocation = new File("spark-warehouse").getAbsolutePath
  private val logger = LoggerFactory.getLogger("CleanOdsToDwd")
  private var prop: Properties = new Properties
  private var yesterday: String = _
  var startDate = ""
  var endDate = ""

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark").setLevel(Level.INFO)
    Logger.getLogger("org.spark_project.jetty").setLevel(Level.WARN)

    // 获取当天和前一天日期

    yesterday = DateUtils.getYesterdayDate

    if (args.length == 2) {
      yesterday = args(0)
    }
    // startDate = "2019-01-01"
    // endDate = "2019-10-22"

    // 初始化spark
    val spark = initSparkSession

    logger.info("===============> 开始生成 conf_game_package_channel 配置汇总表 <===============")
    makePackageConf(spark)

    // 读取ODS 层数据,解析导入 DWD 层事件表
    val start = System.currentTimeMillis()
    logger.info(s"===================> 开始加载Hive ODS层数据进行清洗 <===================")

    prop = loadProp(Constants.HIVE_DW)

    // 清洗数据
    val database = prop.getProperty("hive.database")
    var odsTables = ""

    odsTables = prop.getProperty("ods.sdk.user.login.table")
    cleanData(spark, database, odsTables)

    odsTables = prop.getProperty("ods.sdk.active.table")
    cleanData(spark, database, odsTables)

    val end = System.currentTimeMillis()
    println("耗时:" + (end - start))

    logger.info("===================> 清洗结果数据写入Hive dwd层完成 <===================")
    spark.stop()
  }

  /**
    * 数据清洗
    */
  def cleanData(spark: SparkSession, db: String, table: String): Unit = {
    /**
      * 清洗规则:
      * ① time过滤 0000-00-00 00:00:00
      * ② 过滤 game_id <= 0, game_id =339, package_id <=9 的数据
      * ③ 所有出现字符串都统⼀转成小写格式
      * ④ 通过gameId与conf_base_game_表匹配获得cp_game_id,过滤未匹配到,或cp_game_id<=0 的数据
      * ⑤ login_account后面带\t 等特殊切割符合的数据进行过滤(会判断为非时间类型)
      * 其中,屏幕宽高是两个单独的字段,合并为一个字段
      */

    // 加载 gameId 配置表
    val gameIdTable = "conf_base_game"
    spark.table(s"$db.$gameIdTable").createOrReplaceTempView("confTable")

    // 加载日志数据
    val dataFrame = spark.sql(s"select * from $db.$table where `date`='$yesterday'")
    // var sqlStr = s"select * from $db.$table where `date`>= '$startDate' and `date` < '$endDate'"
    val dataFrame = spark.sql(sqlStr)
      .persist(StorageLevel.MEMORY_ONLY)
    dataFrame.createOrReplaceTempView("tmpTable")

    var destTable = ""

    // 清洗数据
    if (table.equals("ods_sdk_user_login")) {
      destTable = prop.getProperty("dwd.sdk.user.login.table")
      sqlStr =
        s"""
           |select
           | a.id,
           | a.game_id,
           | a.package_id,
           | conf.cp_game_id,
           | lower(rtrim("\t",a.login_account)) as login_account,
           | lower(a.core_account) as core_account,
           | a.time,
           | a.time_server,
           | lower(a.device_id) as  device_id,
           | lower(a.md5_device_id) as  md5_device_id,
           | lower(a.device_code) as  device_code,
           | lower(a.device_key) as  device_key,
           | lower(a.android_id) as  android_id,
           | lower(a.useragent) as  useragent,
           | lower(a.device_type) as  device_type,
           | a.os,
           | a.os_version,
           | a.sdk_version,
           | a.game_version,
           | lower(a.network_type) as  network_type,
           | a.mobile_type,
           | concat(a.screen_width,(case a.screen_height when '' then '' else 'x' end),a.screen_height) as width_height,
           | a.ip,
           | a.lbs,
           | lower(a.refer) as  refer,
           | lower(a.refer_param) as refer_param,
           | a.channel_alter,
           | a.date
           |from confTable as conf join tmpTable as a
           | where a.game_id = conf.game_id
           | and a.game_id > 0
           | and a.game_id != 339
           | and a.package_id > 9
           | and conf.cp_game_id > 0
           | and a.time is not null
            """.stripMargin
    } else if (table.equals("ods_sdk_active_log")) {
      destTable = prop.getProperty("dwd.sdk.active.table")
      sqlStr =
        s"""
           |select
           | a.id,
           | a.game_id,
           | a.package_id,
           | conf.cp_game_id,
           | a.time,
           | a.time_server,
           | lower(a.device_id) as device_id,
           | lower(a.md5_device_id) as md5_device_id,
           | lower(a.device_code) as device_code,
           | lower(a.android_id) as android_id,
           | lower(serial_number) as serial_number,
           | lower(a.device_key) as device_key,
           | lower(a.useragent) as useragent,
           | lower(a.device_type) as device_type,
           | a.os,
           | a.os_version,
           | a.sdk_version,
           | a.game_version,
           | lower(a.network_type) as network_type,
           | a.mobile_type,
           | concat(a.screen_width,(case a.screen_height when '' then '' else 'x' end),a.screen_height) as width_height,
           | a.ip,
           | a.lbs,
           | lower(a.refer) as refer,
           | lower(a.refer_param) as refer_param,
           | a.channel_id,
           | a.sv_key,
           | a.click_id,
           | a.click_time,
           | date
           |from confTable as conf join tmpTable as a
           | where a.game_id = conf.game_id
           | and a.game_id > 0
           | and a.game_id != 339
           | and a.package_id > 9
           | and conf.cp_game_id > 0
           | and a.time is not null
        """.stripMargin
    }

    import spark.sql

    // 数据清洗后写入Hive dwd层表当日分区中
    // 重分区,会将提交时候设置的num-executors计算结果合并为一个,输出每个date分区一个文件
    sql(sqlStr).repartition(1).createOrReplaceTempView("resultTable")
    sqlStr =
      s"""
         |insert overwrite table $db.$dayTable
         | partition(`date`)
         |select * from resultTable
       """.stripMargin
    sql(sqlStr)

    // 或者,重分区比较耗时
    // sql(sqlStr).coalesce(1).write.mode(SaveMode.Append).insertInto(s"$db.$destTable")


    // 或者
    // sql(sqlStr).coalesce(1).createOrReplaceTempView("resultTable")
    // sqlStr = "select * from resultTable"
    // sql(sqlStr).write.mode(SaveMode.Append).insertInto(s"$db.$destTable")

    // 或者
    // sql(sqlStr).coalesce(1).createOrReplaceTempView("resultTable")
    // spark.table("resultTable").write.mode(SaveMode.Append).insertInto(s"$db.$destTable")

    dataFrame.unpersist(true)
  }

  /**
    * 根据配置表生成Hive汇总表 conf_game_package_channel
    */
  def makePackageConf(spark: SparkSession): Unit = {
    prop = loadProp(Constants.HIVE_CONF_TABLE)
    val db = prop.getProperty("hive.database")
    val packageIdTable = prop.getProperty("conf.base.package.table")
    val gameIdTable = prop.getProperty("conf.base.game.table")
    val channelIdTable = prop.getProperty("conf.base.channel.table")
    val summaryConfTable = prop.getProperty("conf.game.package.channel.table")

    val packageDF = spark.table(s"$db.$packageIdTable")
    val gameDF = spark.table(s"$db.$gameIdTable")
    val channelDF = spark.table(s"$db.$channelIdTable")

    val summaryConfDF = packageDF.join(gameDF, "game_id").join(channelDF, "CHANNEL_ID")
    summaryConfDF.createOrReplaceTempView("tmpConfTable")

    val sqlStr =
      s"""
         |insert overwrite table $db.$summaryConfTable
         |select
         | package_id,game_id,cp_game_id,sm_game_name,
         | popularize_v1_id,popularize_v2_id,channel_main_id,channel_main_name,channel_id,channel_name,
         | chan_id,channel_code,channel_label,platform_id,promotion_way_id
         |from tmpConfTable
      """.stripMargin

    spark.sql(sqlStr)
    logger.info(s"========== 生成 $summaryConfTable 汇总表成功! ==========")
  }

  // 加载配置
  def loadProp(properties: String): Properties = {
    val props = new Properties()
    val in = ConfigurationManager.getClass.getClassLoader.getResourceAsStream(properties)
    props.load(in)
    props
  }

  def initSparkSession: SparkSession = SparkSession.builder()
    .appName(this.getClass.getSimpleName)
    .master(Constants.SPARK_YARN_CLIENT_MODE)
    .config("spark.sql.warehouse.dir", warehouseLocation)
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", 2000)
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer", "1024m")
    .config("spark.kryoserializer.buffer.max", "2046m")
    .config("spark.io.compression.codec", "snappy")
    .config("spark.sql.codegen", "true")
    .config("spark.sql.unsafe.enabled", "true")
    .config("spark.shuffle.manager", "tungsten-sort")
    .enableHiveSupport()
    .getOrCreate()
}

 

你可能感兴趣的:(Spark)