在我们拿到的数据集中,经常会出现某些属性的取值缺失的情况。面对这种情况,通常有两种处理办法:一是删除含有缺失值的记录(或属性),二是对缺失值进行填充。下面先构造一个带有缺失值的示例数据集:
from pyspark.sql import SparkSession
# Start a Spark session named "dataDeal", or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("dataDeal")
    .getOrCreate()
)

# Column names, listed in the same order as the fields of each sample row.
_miss_columns = ['id', 'weight', 'height', 'age', 'gender', 'income']

# Sample records for the missing-value walkthrough; None marks a missing value.
_miss_rows = [
    (1, 143.5, 5.6, 28, 'M', 100000),
    (2, 167.2, 5.4, 45, 'M', None),
    (3, None, 5.2, None, None, None),
    (4, 144.5, 5.9, 33, 'M', None),
    (5, 133.2, 5.7, 54, 'F', None),
    (6, 124.1, 5.2, None, 'F', None),
    (7, 129.2, 5.3, 42, 'M', 76000),
]

# Build the demo DataFrame from the rows and column names above.
df_miss = spark.createDataFrame(_miss_rows, _miss_columns)
根据上面的数据,我们可以直观看出: