创建 DataFrame(df)时,空值(真正的 null,而不是字符串)的表示形式为:
null
// Build a single-column string DataFrame whose second row is a real null
// (not the text "null").
val df = Seq("a", null, "c", "b").toDF("col1")
df.show()

// Fill nulls in col1 with the placeholder "qqq". The result is never
// reassigned, so it is declared as a val.
val df4 = df.na.fill("qqq", Array("col1"))
df4.show()
df: org.apache.spark.sql.DataFrame = [col1: string]
+----+
|col1|
+----+
| a|
|null|
| c|
| b|
+----+
df4: org.apache.spark.sql.DataFrame = [col1: string]
+----+
|col1|
+----+
| a|
| qqq|
| c|
| b|
+----+
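此时的空值形式为(show() 中的显示形式):
“null”
注意:regexp_replace 对 null 输入仍然返回 null,所以这里的空值依旧是真正的 null,而不是字符串 "null";因此下面的 na.fill 仍然可以把它填充为 "qqq"。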
// Run col1 through regexp_replace, turning any occurrence of the pattern
// "NullNone" into the text "null". In the output shown for this step the
// null row is still displayed as null.
val df2 = {
  val rewrittenCol1 = regexp_replace(col("col1"), "NullNone", "null")
  df.withColumn("col1", rewrittenCol1)
}
df2.show()
df2: org.apache.spark.sql.DataFrame = [col1: string]
+----+
|col1|
+----+
| a|
|null|
| c|
| b|
+----+
// Fill the nulls that remain in col1 of df2 with the placeholder "qqq".
val fillColumns = Array("col1")
val df3 = df2.na.fill("qqq", fillColumns)
df3.show()
df3: org.apache.spark.sql.DataFrame = [col1: string]
+----+
|col1|
+----+
| a|
| qqq|
| c|
| b|
+----+
相比于 String 类型的列:数值(Double)列经过 regexp_replace 处理后会变成 String 类型,因此需要在最后把特征列的类型重新转换回 Double。
import spark.implicits._
// Sample data set: every column is created as a string, and one row has a
// missing (null) Pclass. Pclass is then cast to double so that it can be
// filled with a numeric value later. Built as one expression so that no
// reassignment (and therefore no var) is needed.
val data1 = {
  val raw = Seq(
    ("0.0", "1002", "1", "1.5", "bai"),
    ("1.0", "2004", "2", "2.1", "wang"),
    ("0.0", "3007", "2", "2.1", "wang"),
    ("0.0", "4004", "3", "3.4", "wa"),
    ("1.0", "5007", "3", "3.4", "wa"),
    ("1.0", "17009", null, "5.9", "wei"),
    ("0.0", "18010", "12", "5.9", "wei")
  ).toDF("label", "AMOUNT", "Pclass", "name", "MAC_id")
  raw.withColumn("Pclass", col("Pclass").cast("double"))
}
data1.show()
+-----+------+------+----+------+
|label|AMOUNT|Pclass|name|MAC_id|
+-----+------+------+----+------+
| 0.0| 1002| 1| 1.5| bai|
| 1.0| 2004| 2| 2.1| wang|
| 0.0| 3007| 2| 2.1| wang|
| 0.0| 4004| 3| 3.4| wa|
| 1.0| 5007| 3| 3.4| wa|
| 1.0| 17009| null| 5.9| wei|
| 0.0| 18010| 12| 5.9| wei|
+-----+------+------+----+------+
// Step 1: fill nulls in the columns named by ever_colName_list (defined
// elsewhere) with the numeric sentinel -100.0, then show data and column types.
val sentinelFilled = data1.na.fill(-100.0, ever_colName_list)
sentinelFilled.show()
println(sentinelFilled.dtypes.toMap)

// Step 2: turn the sentinel back into null, one column at a time.
// regexp_replace is applied to the column and the result is cast back to
// double afterwards.
// The pattern is anchored and the dot is escaped: the previous pattern
// "-100.0" was unanchored and its "." matched any character, so values such
// as -10000.0 or -100.05 were rewritten too and ended up as null.
// NOTE(review): relies on an unparseable string ("null") casting to a null
// double; presumably this fails under ANSI cast semantics — confirm the
// session's spark.sql.ansi.enabled setting.
val result_data = ever_colName_list.foldLeft(sentinelFilled) { (acc, cln) =>
  val sentinelAsText = regexp_replace(col(cln), "^-100\\.0$", "null")
  acc
    .withColumn(cln, sentinelAsText)
    .withColumn(cln, col(cln).cast("double"))
}
result_data.show()
println(result_data.dtypes.toMap)
结果:
+-----+------+------+----+------+
|label|AMOUNT|Pclass|name|MAC_id|
+-----+------+------+----+------+
| 0.0| 1002| 1.0| 1.5| bai|
| 1.0| 2004| 2.0| 2.1| wang|
| 0.0| 3007| 2.0| 2.1| wang|
| 0.0| 4004| 3.0| 3.4| wa|
| 1.0| 5007| 3.0| 3.4| wa|
| 1.0| 17009|-100.0| 5.9| wei|
| 0.0| 18010| 12.0| 5.9| wei|
+-----+------+------+----+------+
Map(name -> StringType, label -> StringType, Pclass -> DoubleType, AMOUNT -> StringType, MAC_id -> StringType)
+-----+------+------+----+------+
|label|AMOUNT|Pclass|name|MAC_id|
+-----+------+------+----+------+
| 0.0| 1002| 1.0| 1.5| bai|
| 1.0| 2004| 2.0| 2.1| wang|
| 0.0| 3007| 2.0| 2.1| wang|
| 0.0| 4004| 3.0| 3.4| wa|
| 1.0| 5007| 3.0| 3.4| wa|
| 1.0| 17009| null| 5.9| wei|
| 0.0| 18010| 12.0| 5.9| wei|
+-----+------+------+----+------+
Map(name -> StringType, label -> StringType, Pclass -> DoubleType, AMOUNT -> StringType, MAC_id -> StringType)