Spark 填充缺失值系列

填充均值

  //连续值填充均值
      /**
        * Replaces nulls in the given columns with each column's mean.
        *
        * The mean is computed per column over the whole DataFrame in a single
        * aggregation, and the resulting values are passed to `na.fill`.
        *
        * @param df      input DataFrame
        * @param douCols names of the columns to fill
        *                (assumed to be numeric columns - TODO confirm against callers)
        * @return a DataFrame in which nulls of the listed columns are replaced by the
        *         column mean; a column whose mean is null (no non-null values) is left
        *         unchanged, and `df` itself is returned when there is nothing to fill
        */
      def ContinuousMissValueProcessingMean(df:DataFrame,douCols:Array[String]):DataFrame= {
            println("----连续值填充均值----开始-----")
            if (douCols.isEmpty) {
                  // Nothing to fill, so skip the aggregation job entirely.
                  df
            } else {
                  val selected = df.select(douCols.map(c => col(c)): _*)
                  val names = selected.columns
                  // One aggregate per column, evaluated in a single pass; the result is a single row.
                  val stats = selected.select(names.map(mean(_)): _*).first.toSeq
                  // na.fill does not accept null replacement values, so drop columns
                  // whose aggregate came back null instead of letting the call fail.
                  val fillValues: Map[String, Any] = names.zip(stats).collect {
                        case (name, value) if value != null => name -> value
                  }.toMap
                  if (fillValues.isEmpty) df else df.na.fill(fillValues)
            }
      }

填充最大值

     //连续值填充最大值
      /**
        * Replaces nulls in the given columns with each column's maximum.
        *
        * The maximum is computed per column over the whole DataFrame in a single
        * aggregation, and the resulting values are passed to `na.fill`.
        *
        * @param df      input DataFrame
        * @param douCols names of the columns to fill
        *                (assumed to be numeric columns whose maximum is a value type
        *                `na.fill` accepts - TODO confirm against callers)
        * @return a DataFrame in which nulls of the listed columns are replaced by the
        *         column maximum; a column whose maximum is null (no non-null values) is
        *         left unchanged, and `df` itself is returned when there is nothing to fill
        */
      def ContinuousMissValueProcessingMax(df:DataFrame,douCols:Array[String]):DataFrame={
            // Log message corrected: it previously announced "mean" filling.
            println("----连续值填充最大值----开始-----")
            if (douCols.isEmpty) {
                  // Nothing to fill, so skip the aggregation job entirely.
                  df
            } else {
                  val selected = df.select(douCols.map(c => col(c)): _*)
                  val names = selected.columns
                  // One aggregate per column, evaluated in a single pass; the result is a single row.
                  val stats = selected.select(names.map(max(_)): _*).first.toSeq
                  // na.fill does not accept null replacement values, so drop columns
                  // whose aggregate came back null instead of letting the call fail.
                  val fillValues: Map[String, Any] = names.zip(stats).collect {
                        case (name, value) if value != null => name -> value
                  }.toMap
                  if (fillValues.isEmpty) df else df.na.fill(fillValues)
            }
      }

填充最小值

 //连续值填充最小值
      /**
        * Replaces nulls in the given columns with each column's minimum.
        *
        * The minimum is computed per column over the whole DataFrame in a single
        * aggregation, and the resulting values are passed to `na.fill`.
        *
        * @param df      input DataFrame
        * @param douCols names of the columns to fill
        *                (assumed to be numeric columns whose minimum is a value type
        *                `na.fill` accepts - TODO confirm against callers)
        * @return a DataFrame in which nulls of the listed columns are replaced by the
        *         column minimum; a column whose minimum is null (no non-null values) is
        *         left unchanged, and `df` itself is returned when there is nothing to fill
        */
      def ContinuousMissValueProcessingMin(df:DataFrame,douCols:Array[String]):DataFrame={
            // Log message corrected: it previously announced "mean" filling.
            println("----连续值填充最小值----开始-----")
            if (douCols.isEmpty) {
                  // Nothing to fill, so skip the aggregation job entirely.
                  df
            } else {
                  val selected = df.select(douCols.map(c => col(c)): _*)
                  val names = selected.columns
                  // One aggregate per column, evaluated in a single pass; the result is a single row.
                  val stats = selected.select(names.map(min(_)): _*).first.toSeq
                  // na.fill does not accept null replacement values, so drop columns
                  // whose aggregate came back null instead of letting the call fail.
                  val fillValues: Map[String, Any] = names.zip(stats).collect {
                        case (name, value) if value != null => name -> value
                  }.toMap
                  if (fillValues.isEmpty) df else df.na.fill(fillValues)
            }
      }

你可能感兴趣的:(spark,spark,缺失值填充,dataframe)