Preparing the data for modeling:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# create a local SparkContext and wrap it in a SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)
df = spark.createDataFrame([
(1, 144.5, 5.9, 33, 'M'),
(2, 167.2, 5.4, 45, 'M'),
(3, 124.1, 5.2, 23, 'F'),
(4, 144.5, 5.9, 33, 'M'),
(5, 133.2, 5.7, 54, 'F'),
(3, 124.1, 5.2, 23, 'F'),
(5, 129.2, 5.3, 42, 'M'),
], ['id', 'weight', 'height', 'age', 'gender'])
df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 1| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 3| 124.1| 5.2| 23| F|
| 5| 129.2| 5.3| 42| M|
+---+------+------+---+------+
print(df.count())  # number of rows: 7
print(df.distinct().count())  # number of distinct rows: 6
# drop duplicate rows
df = df.dropDuplicates()
df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 5| 133.2| 5.7| 54| F|
| 5| 129.2| 5.3| 42| M|
| 1| 144.5| 5.9| 33| M|
| 4| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
+---+------+------+---+------+
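Note that ids 1 and 4 still agree on every column except id, so they may well be the same person recorded twice. A minimal sketch of removing those as well, under the assumption that rows identical apart from id count as duplicates (df_no_dupes is an illustrative name, not from the original):

# drop rows that are identical in every column except 'id'
df_no_dupes = df.dropDuplicates(
    subset=[c for c in df.columns if c != 'id'])
df_no_dupes.show()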
# count all ids and the number of distinct ids
import pyspark.sql.functions as F
df.agg(
F.count('id').alias('all'),
F.countDistinct('id').alias('distinct_id')
).show()
+---+-----------+
|all|distinct_id|
+---+-----------+
| 6| 5|
+---+-----------+
# assign a unique id to each row
df.withColumn('new_id', F.monotonically_increasing_id()).show()
+---+------+------+---+------+-------------+
| id|weight|height|age|gender| new_id|
+---+------+------+---+------+-------------+
| 5| 133.2| 5.7| 54| F| 171798691840|
| 5| 129.2| 5.3| 42| M| 326417514496|
| 1| 144.5| 5.9| 33| M| 481036337152|
| 4| 144.5| 5.9| 33| M| 644245094400|
| 2| 167.2| 5.4| 45| M| 721554505728|
| 3| 124.1| 5.2| 23| F|1623497637888|
+---+------+------+---+------+-------------+
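The generated ids are unique and monotonically increasing but not consecutive: monotonically_increasing_id() packs the partition id into the upper bits, which explains the large gaps above. A hedged sketch, assuming consecutive ids are actually needed (the window has no partitioning, so Spark will pull all rows into a single partition):

from pyspark.sql.window import Window

# row_number over a global ordering yields consecutive ids 1..n
w = Window.orderBy('id')
df.withColumn('new_id', F.row_number().over(w)).show()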
Handling missing values
df_miss = spark.createDataFrame([
(1, 143.5, 5.6, 28, 'M', 100000),
(2, 167.2, 5.4, 45, 'M', None),
(3, None , 5.2, None, None, None),
(4, 144.5, 5.9, 33, 'M', None),
(5, 133.2, 5.7, 54, 'F', None),
(6, 124.1, 5.2, None, 'F', None),
(7, 129.2, 5.3, 42, 'M', 76000),
], ['id', 'weight', 'height', 'age', 'gender', 'income'])
df_miss.show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
| 7| 129.2| 5.3| 42| M| 76000|
+---+------+------+----+------+------+
# count the number of missing values in each row
df_miss.rdd.map(lambda row: (row['id'], sum([c is None for c in row]))).collect()
[(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]
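Row 3 is missing four of its six fields and carries almost no information, so it is worth inspecting before deciding what to do with it. A small check (illustrative, not part of the original listing):

# look at the row with the most missing values
df_miss.where('id == 3').show()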
# compute the fraction of missing values in each column;
# count('*') counts all rows, count(c) only the non-null values of c
df_miss.agg(
*[(1 - (F.count(c) / F.count('*'))).alias(c+'_missing') for c in df_miss.columns]
).show()
+----------+------------------+--------------+------------------+------------------+------------------+
|id_missing| weight_missing|height_missing| age_missing| gender_missing| income_missing|
+----------+------------------+--------------+------------------+------------------+------------------+
| 0.0|0.1428571428571429| 0.0|0.2857142857142857|0.1428571428571429|0.7142857142857143|
+----------+------------------+--------------+------------------+------------------+------------------+
# drop the income column, which is missing in over 70% of rows
data_drop_income = df_miss.select([c for c in df_miss.columns if c != 'income'])
data_drop_income.show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
| 7| 129.2| 5.3| 42| M|
+---+------+------+----+------+
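Here income was dropped by name; the same rule can be generalized to drop any column whose missing rate exceeds a cutoff. A sketch assuming a 0.7 threshold (the variable names are illustrative):

# keep only columns with at most 70% missing values
miss_rates = df_miss.agg(
    *[(1 - (F.count(c) / F.count('*'))).alias(c) for c in df_miss.columns]
).collect()[0].asDict()
keep = [c for c, rate in miss_rates.items() if rate <= 0.7]
df_miss.select(keep).show()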
# drop rows with fewer than 3 non-missing values (thresh counts non-nulls)
data_drop_income.dropna(thresh=3).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
| 7| 129.2| 5.3| 42| M|
+---+------+------+----+------+
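thresh is not the only knob: dropna() also accepts how='any'/'all' and a subset of columns. For instance, to drop only rows where age itself is missing (an illustrative variant, not used in the rest of this section):

# drop rows with a null in the 'age' column, ignoring other columns
data_drop_income.dropna(subset=['age']).show()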
# fill continuous features with the column mean, and the categorical
# 'gender' feature with the string 'missing'
means = data_drop_income.agg(
    *[F.mean(c).alias(c) for c in data_drop_income.columns if c != 'gender']
).toPandas().to_dict('records')[0]
means['gender'] = 'missing'
data_drop_income.fillna(means).show()
+---+------------------+------+---+-------+
| id| weight|height|age| gender|
+---+------------------+------+---+-------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3|140.28333333333333| 5.2| 40|missing|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2| 40| F|
| 7| 129.2| 5.3| 42| M|
+---+------------------+------+---+-------+
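The same mean-filling for numeric columns is also available as an ML transformer. A hedged sketch using pyspark.ml.feature.Imputer; the string gender column still needs fillna, and the cast to double is an assumption, since some Spark versions require floating-point inputs:

from pyspark.ml.feature import Imputer

# impute numeric columns with their means via the ML Imputer
num_cols = ['weight', 'age']
df_num = data_drop_income.select(
    'id', *[F.col(c).cast('double').alias(c) for c in num_cols])
imputer = Imputer(strategy='mean', inputCols=num_cols,
                  outputCols=[c + '_imp' for c in num_cols])
imputer.fit(df_num).transform(df_num).show()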
Outliers
df_outliers = spark.createDataFrame([
(1, 143.5, 5.3, 28),
(2, 154.2, 5.5, 45),
(3, 342.3, 5.1, 99),
(4, 144.5, 5.5, 33),
(5, 133.2, 5.4, 54),
(6, 124.1, 5.1, 21),
(7, 129.2, 5.3, 42),
], ['id', 'weight', 'height', 'age'])
df_outliers.show()
+---+------+------+---+
| id|weight|height|age|
+---+------+------+---+
| 1| 143.5| 5.3| 28|
| 2| 154.2| 5.5| 45|
| 3| 342.3| 5.1| 99|
| 4| 144.5| 5.5| 33|
| 5| 133.2| 5.4| 54|
| 6| 124.1| 5.1| 21|
| 7| 129.2| 5.3| 42|
+---+------+------+---+
cols = ['weight', 'height', 'age']
bounds = {}
for col in cols:
    # approximate 1st and 3rd quartiles (0.05 is the allowed relative error)
    quan = df_outliers.approxQuantile(col, [0.25, 0.75], 0.05)
    IQR = quan[1] - quan[0]
    # Tukey's fences: anything beyond 1.5 * IQR from the quartiles is flagged
    bounds[col] = [quan[0] - 1.5 * IQR, quan[1] + 1.5 * IQR]
bounds
{'weight': [91.69999999999999, 191.7],
'height': [4.499999999999999, 6.1000000000000005],
'age': [-11.0, 93.0]}
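The third argument to approxQuantile is the target relative error; 0.05 trades a little accuracy for speed on large data. On seven rows the exact computation is free, so a quick sanity check could use:

# relativeError=0.0 returns exact quantiles (fine for small data)
df_outliers.approxQuantile('weight', [0.25, 0.75], 0.0)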
# flag values that fall outside the bounds
outliers = df_outliers.select(*['id'] + [
(
(df_outliers[c] < bounds[c][0]) |
(df_outliers[c] > bounds[c][1])
).alias(c + '_o') for c in cols
])
outliers.show()
+---+--------+--------+-----+
| id|weight_o|height_o|age_o|
+---+--------+--------+-----+
| 1| false| false|false|
| 2| false| false|false|
| 3| true| false| true|
| 4| false| false|false|
| 5| false| false|false|
| 6| false| false|false|
| 7| false| false|false|
+---+--------+--------+-----+
# show the outlying values
df_outliers = df_outliers.join(outliers, on='id')
df_outliers.filter('weight_o').select('id', 'weight').show()
df_outliers.filter('age_o').select('id', 'age').show()
+---+------+
| id|weight|
+---+------+
| 3| 342.3|
+---+------+
+---+---+
| id|age|
+---+---+
| 3| 99|
+---+---+
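Having confirmed that row 3 is the only outlier, a natural follow-up is to filter the flagged rows out before modeling. A minimal sketch (the original stops at inspection; 'clean' is an illustrative name):

from functools import reduce

# keep only rows where no outlier indicator is true
clean = df_outliers.filter(
    ~reduce(lambda a, b: a | b, [F.col(c + '_o') for c in cols]))
clean.select('id', *cols).show()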