Spark: reading an HDFS file and writing it into a Hive table

package com.job

import org.apache.commons.cli.{BasicParser, CommandLine, Options}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructType}

case class Transfer2HiveConfig(query_day: String)

object Transfer2HiveJob {
  val QUERY_DAY = "query-day"

  def parseAsConfig(commandLine: CommandLine): Option[Transfer2HiveConfig] = {
    // getOptionValue returns null when the option is missing, so wrap it in Option
    // and let main exit with an error instead of building a config with a null day
    Option(commandLine.getOptionValue(QUERY_DAY)).map(queryDay => Transfer2HiveConfig(queryDay))
  }

  def main(args: Array[String]): Unit = {
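    // note: BasicParser is deprecated in commons-cli 1.3+; DefaultParser is the usual replacement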
    val parser = new BasicParser()
    val options = new Options()
    options.addOption("qd", QUERY_DAY, true, "query day")
    val commandLine = parser.parse(options, args)

    parseAsConfig(commandLine) match {
      case Some(config) =>
        val spark = SparkSession.builder()
          .appName("Transfer2HiveJob")
          .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
          .enableHiveSupport()
          .getOrCreate()
        try {
          new Transfer2HiveJob(spark, config).run()
        } catch {
          case ex: Throwable =>
            println(s"error running ${classOf[Transfer2HiveJob].getSimpleName}, $ex")
            sys.exit(1)
        } finally {
          // stop the session on the normal path; sys.exit above terminates the JVM directly
          spark.stop()
        }
      case None => sys.exit(1)
    }
  }
}

class Transfer2HiveJob(spark: SparkSession, config: Transfer2HiveConfig) {

  import spark.implicits._

  def run(): Unit = {
    // "path" is a placeholder; it could also be passed in as a command-line argument
    transfer2hive("path")
  }

  def transfer2hive(path: String): Unit = {
    println(s"-----------------path=$path")
    // Build the schema for the two tab-separated columns
    val dataSchema: StructType = new StructType()
      .add("column1", StringType)
      .add("column2", StringType)

    // Switch to the target database ("xxx" is a placeholder name)
    spark.sql("use xxx")

    // Read the HDFS file and keep only well-formed two-column rows
    // (an alternative using spark.read is sketched after the listing)
    val data = spark.sparkContext.textFile(path)
      .map(item => item.split("\t"))
      .filter(item => item.length == 2)
      .map(item => Row(item(0), item(1)))

    // Register the data with its schema as a temporary view
    spark.createDataFrame(data, dataSchema)
      .createOrReplaceTempView("table_temp")

    // Drop the target partition if it already exists, so the job can be re-run safely
    // (an INSERT OVERWRITE alternative is sketched after the listing)
    val dropPartitionSQL = s"alter table table_name drop if exists partition(p_day='${config.query_day}')"
    spark.sql(dropPartitionSQL)

    // Insert the data into the corresponding partition of the Hive table
    val insert2hiveSQL = s"insert into table_name " +
      s"partition(p_day='${config.query_day}') " +
      s"select column1, column2 from table_temp"
    spark.sql(insert2hiveSQL)
  }
}
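
As a side note, the read-and-register step above (textFile, split, filter, Row) can also be written with the DataFrame reader. The sketch below is not part of the original job; it assumes the same tab-separated two-column layout, and Transfer2HiveReadSketch / readAsDataFrame are illustrative names only:

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.types.{StringType, StructType}

object Transfer2HiveReadSketch {
  // Read the tab-separated file directly into a DataFrame with an explicit schema,
  // avoiding the RDD -> Row -> createDataFrame round trip used in the listing.
  def readAsDataFrame(spark: SparkSession, path: String): DataFrame = {
    val dataSchema = new StructType()
      .add("column1", StringType)
      .add("column2", StringType)
    spark.read
      .option("sep", "\t")             // tab-separated, matching split("\t") above
      .option("mode", "DROPMALFORMED") // drop rows that do not fit the schema, roughly like the length == 2 filter
      .schema(dataSchema)
      .csv(path)
  }
}

The resulting DataFrame can then be registered with createOrReplaceTempView("table_temp") and the rest of the job left unchanged.

Similarly, the drop-partition-then-insert sequence can usually be collapsed into a single INSERT OVERWRITE statement. This is only a sketch that reuses the spark, config, and placeholder table name from the listing above, not the original author's code:

// INSERT OVERWRITE replaces the contents of the target partition in one statement,
// so the separate "alter table ... drop partition" step is no longer needed.
val overwriteSQL =
  s"insert overwrite table table_name " +
  s"partition(p_day='${config.query_day}') " +
  s"select column1, column2 from table_temp"
spark.sql(overwriteSQL)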

 
