sparksql dataFrame 关于列的增删改操作

最近项目中用到spark和ES的集成,涉及到一系列对dataFrame列的操作,时间主要花在列的增删改上面。

整个类采用的是 Spark + ES + Redis 架构:ES 存储基础数据,Redis 存储筛选条件,程序根据 Redis 中的条件从 ES 中筛选出符合条件的记录。

// Create (or reuse) the SparkSession that the following snippets refer to as `sqlContex`.
val sqlContex = {
  // Key/value settings applied to the builder, in the order listed.
  val settings = Seq(
    // "hive.metastore.uris" -> "thrift://172.1.1.199:9083", // Hive integration
    "spark.testing.memory" -> "2147480000",
    "es.index.auto.create" -> "true",
    "es.nodes" -> "xxxx"
  )
  val base = SparkSession.builder().master("local[2]").appName("spark2ES")
  settings
    .foldLeft(base) { case (builder, (key, value)) => builder.config(key, value) }
    // .enableHiveSupport() // add Hive support
    .getOrCreate()
}

// Read the ES data into a DataFrame through the elasticsearch-spark SQL data source.
val optionMap = Map(
  "path" -> "ods_wj_apk_index/docs",
  "pushdown" -> "true"
)
val esDF = sqlContex.read
  .format("org.elasticsearch.spark.sql")
  .options(optionMap)
  .load()

// 增加列操作

 
  
// Approach 1: register a UDF and call it from a SQL statement.
// `tempDataFrame` is the DataFrame being extended; it is defined outside this snippet.
val topic = "topic123"
tempDataFrame.createOrReplaceTempView("temp")
// The UDF ignores its argument and returns the captured `topic` for every row.
sqlContex.udf.register("replaceCol", (_: String) => topic)
val addDf = sqlContex.sql("select *,replaceCol(content) as topicName from temp")
addDf.show()

// Approach 2: wrap a Scala function with `udf` and add the column via withColumn.
val topic = "topic123"
// Ignores the input value and always yields `topic`.
val replace: String => String = _ => topic
val replaceCol = udf(replace)
val data = tempDataFrame.withColumn(
  "topicName",
  replaceCol(tempDataFrame.col("content"))
)
data.show()
// Modifying column values: build a new DataFrame. Select the columns that stay
// unchanged by name, compute each modified column with a UDF aliased back to the
// original column name, and use the resulting DataFrame in place of the old one.
esDF.createOrReplaceTempView("temp")

// Register the conversion function as a SQL UDF.
sqlContex.udf.register("Time2Time", Time2Time _)

// Comma-separated names of every column except update_time / create_time.
val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)

// `conditon` is the SQL WHERE expression; it is defined outside this snippet.
// There must be no comma between the last select item and `from`: a trailing comma
// in the select list is a SQL parse error.
val addDf = sqlContex.sqlContext.sql(s"select $linkedColNames,Time2Time(update_time) AS update_time,Time2Time(create_time) AS create_time from temp where $conditon")
// addDf.saveToEs("ods_wj_scenes_detail/docs")
addDf.select("update_time").show(50)
// Dropping a column: `drop` returns a new DataFrame and leaves `tempDataFrame`
// itself unchanged, so the result has to be kept and used.
val droppedDf = tempDataFrame.drop("topicName")
 
  

下面是完整代码

/**
 * Reads documents from Elasticsearch into a DataFrame and, for each configured topic,
 * filters them with that topic's condition, re-renders the `update_time` and
 * `create_time` columns as strings and adds a constant `topicName` column.
 */
object EsFilterforNetworkMonitoring {

  /** Columns that are re-derived through [[Time2Time]] instead of being selected as-is. */
  private val TimeColumns: Set[String] = Set("update_time", "create_time")

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]")
      .appName("spark2ES")
      // .config("hive.metastore.uris", "thrift://172.1.1.199:9083")
      .config("spark.testing.memory", "2147480000")
      .config("es.index.auto.create", "true")
      .config("es.nodes", "172.10.4.4:9200,172.10.4.5:9200,172.10.4.6:9200")
      // .enableHiveSupport()
      .getOrCreate()

    try {
      // Section 1: read the ES data.
      val optionMap = Map("path" -> "ods_wj_apk_index/docs", "pushdown" -> "true")
      val esDF = spark.read.format("org.elasticsearch.spark.sql").options(optionMap).load()
      // topic name -> filter condition written with ==, && and ||
      val topicMap = Map("topic123456789" -> "1==1&&2==2")

      // None of the following depends on the topic, so do it once rather than per iteration.
      esDF.createOrReplaceTempView("temp")
      spark.udf.register("Time2Time", Time2Time _)
      val linkedColNames = getLinkedColNames(esDF.schema.fieldNames)
      // When only the time columns exist there is nothing to pass through; emitting the
      // separator anyway would produce "select ,Time2Time(...)", which does not parse.
      val passThrough = if (linkedColNames.isEmpty) "" else s"$linkedColNames,"

      // Select the rows matching each topic's condition.
      for ((topic, rawCondition) <- topicMap) {
        // Translate the operators into SQL. `replace` (not `replaceAll`) is used for "||"
        // because `|` is a regex metacharacter.
        // NOTE(review): the condition is spliced into the SQL text verbatim. That is fine
        // for the hard-coded map above; validate it if it ever comes from an external source.
        val condition = rawCondition.replaceAll("==", "=").replaceAll("&&", " and ").replace("||", " or ")
        println("============================= " + condition)

        // UDF that ignores its argument and returns the current topic for every row.
        spark.udf.register("replaceCol", (str: String) => topic)
        println("linkedNames : " + linkedColNames)

        val addDf = spark.sql(
          s"select ${passThrough}Time2Time(update_time) AS update_time,Time2Time(create_time) AS create_time,replaceCol(apk_name) as topicName from temp where $condition")
        // addDf.saveToEs("ods_wj_scenes_detail/docs")
        addDf.select("update_time").show(50)
      }
    } finally {
      // Release the session even if a query fails.
      spark.stop()
    }
  }

  /**
   * Formats a timestamp as `yyyy-MM-dd'T'HH:mm:ss.SSS'Z'`.
   *
   * The trailing `Z` in the pattern is a quoted literal, not a zone field, so the
   * formatter is pinned to UTC; otherwise the text would carry the JVM's local
   * wall-clock time while claiming to be UTC.
   *
   * @param tp the timestamp to format; may be null
   * @return the formatted string, or null when `tp` is null or formatting fails
   *         (null is kept, rather than Option, because this is registered as a SQL UDF)
   */
  def Time2Time(tp: java.sql.Timestamp): String = {
    import java.text.SimpleDateFormat
    import java.util.TimeZone

    if (tp == null) {
      null
    } else {
      try {
        // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
        val sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")
        sdf.setTimeZone(TimeZone.getTimeZone("UTC"))
        sdf.format(tp)
      } catch {
        case e: Exception =>
          // Best effort: report the problem and yield null for this value.
          e.printStackTrace()
          null
      }
    }
  }

  /**
   * Joins the given column names with commas, leaving out `update_time` and `create_time`.
   *
   * @param arr the column names, e.g. a schema's field names
   * @return the remaining names separated by ","; empty when nothing remains
   */
  def getLinkedColNames(arr: Array[String]): String =
    arr.filterNot(TimeColumns.contains).mkString(",")
}






你可能感兴趣的:(sparksql dataFrame 关于列的增删改操作)