from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# Create (or reuse) the SparkSession for this walkthrough; the application name is "lz".
spark = (
    SparkSession.builder
    .appName("lz")
    .getOrCreate()
)
# Obtain the SparkContext via getOrCreate().
sc = SparkContext.getOrCreate()
# Check for duplicates, missing values, and outliers.
# Sample data: the rows with id 1 and id 4 differ only by id, the id-3 row
# appears twice, and id 5 is used by two different rows.
# (The leftover REPL "..." continuation prompts were removed: in a script they
# parse as calls on Ellipsis and fail at runtime.)
df = spark.createDataFrame(
    [
        (1, 144.5, 5.9, 33, 'M'),
        (2, 167.2, 5.4, 45, 'M'),
        (3, 124.1, 5.2, 23, 'F'),
        (4, 144.5, 5.9, 33, 'M'),
        (5, 133.2, 5.7, 54, 'F'),
        (3, 124.1, 5.2, 23, 'F'),
        (5, 129.2, 5.3, 42, 'M'),
    ],
    ['id', 'weight', 'height', 'age', 'gender'],
)
# Total number of rows, duplicates included.
print('count of rows: {0}'.format(df.count()))
count of rows: 7
# Check with .distinct(): count the rows that remain once exact duplicates are collapsed.
print('count of distinct rows: {0}'.format(df.distinct().count()))
count of distinct rows: 6
# Remove exact duplicate rows. With no subset given, dropDuplicates() compares
# ALL columns (not just the first one), so rows that differ in any column are kept.
df = df.dropDuplicates()
df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 5| 133.2| 5.7| 54| F|
| 5| 129.2| 5.3| 42| M|
| 1| 144.5| 5.9| 33| M|
| 4| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
+---+------+------+---+------+
# Number of rows (one id value per row) in the current df.
print ('count of ids: {0}'.format(df.count()))
count of ids: 6
# NOTE: despite the label, this does not count distinct id values. It counts the
# distinct rows formed by every column EXCEPT 'id', i.e. it detects rows that are
# identical apart from their id.
print('count of distinct ids: {0}'.format(df.select([c for c in df.columns if c!='id']).distinct().count()))
count of distinct ids: 5
# Deduplicate again, this time comparing every column except 'id', so rows that
# differ only by id collapse into one.
df = df.dropDuplicates(subset = [c for c in df.columns if c!='id'])
df.show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 5| 133.2| 5.7| 54| F|
| 1| 144.5| 5.9| 33| M|
| 2| 167.2| 5.4| 45| M|
| 3| 124.1| 5.2| 23| F|
| 5| 129.2| 5.3| 42| M|
+---+------+------+---+------+
import pyspark.sql.functions as fn
# We now know there are no fully duplicated rows and no rows that are identical
# apart from their ID, so check whether any ID value is repeated.
# .agg(...) computes the total and the distinct number of IDs in one step.
# (The leftover REPL "..." continuation prompt was removed so this runs as a script.)
df.agg(
    fn.count('id').alias('count'),
    fn.countDistinct('id').alias('distinct'),
).show()
+-----+--------+
|count|distinct|
+-----+--------+
| 5| 4|
+-----+--------+
# Assign fresh IDs with fn.monotonically_increasing_id(). As the output shows, the
# generated values are unique and increasing but not consecutive. The result is
# only displayed here; it is not assigned back to df.
df.withColumn('new_id',fn.monotonically_increasing_id()).show()
+---+------+------+---+------+-------------+
| id|weight|height|age|gender| new_id|
+---+------+------+---+------+-------------+
| 5| 133.2| 5.7| 54| F| 25769803776|
| 1| 144.5| 5.9| 33| M| 171798691840|
| 2| 167.2| 5.4| 45| M| 592705486848|
| 3| 124.1| 5.2| 23| F|1236950581248|
| 5| 129.2| 5.3| 42| M|1365799600128|
+---+------+------+---+------+-------------+
# Sample data with missing values (None) for the dropna experiments below.
# Only the id-1 row is complete; 'income' is missing for every other row.
# (The leftover REPL "..." continuation prompts were removed: in a script they
# parse as calls on Ellipsis and fail at runtime.)
df_miss = spark.createDataFrame(
    [
        (1, 143.5, 5.6, 28, 'M', 100000),
        (2, 167.2, 5.4, 45, 'M', None),
        (3, None, 5.2, None, None, None),
        (4, 144.5, 5.9, 33, 'M', None),
        (5, 133.2, 5.7, 54, 'F', None),
        (6, 124.1, 5.2, None, 'F', None),
    ],
    ['id', 'weight', 'height', 'age', 'gender', 'income'],
)
# Drop the 'income' column; the remaining columns keep their original order.
df_miss_no_income = df_miss.drop('income')
# Note what the thresh parameter means: it is the minimum number of non-null
# values a row must have. Any row with fewer non-null values than thresh is dropped.
# thresh=3: the id-3 row has only 2 non-null values (id, height), so it is dropped.
df_miss_no_income.dropna(thresh=3).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# thresh=2: every row has at least 2 non-null values (id-3 has exactly 2), so nothing is dropped.
df_miss_no_income.dropna(thresh=2).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# thresh=1: every row has a non-null id, so nothing is dropped.
df_miss_no_income.dropna(thresh=1).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# thresh=4: id-3 (2 non-null values) is dropped; id-6 has exactly 4 non-null values and is kept.
df_miss_no_income.dropna(thresh=4).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# thresh=5: equals the column count here, so only rows with no nulls at all survive.
df_miss_no_income.dropna(thresh=5).show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
+---+------+------+---+------+
# thresh=6: exceeds the 5 available columns, so no row can qualify and the result is empty.
df_miss_no_income.dropna(thresh=6).show()
+---+------+------+---+------+
| id|weight|height|age|gender|
+---+------+------+---+------+
+---+------+------+---+------+
# thresh=0: no minimum is required, so every row is kept.
df_miss_no_income.dropna(thresh=0).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# NOTE: invalid call kept for illustration. `thresh>3` is a comparison on an
# undefined name, not a keyword argument, so Python raises NameError (traceback
# below). dropna expects thresh=<int>.
df_miss_no_income.dropna(thresh>3).show()
Traceback (most recent call last):
File "", line 1, in
df_miss_no_income.dropna(thresh>3).show()
NameError: name 'thresh' is not defined
# Show the DataFrame itself: all six rows are still present, so the earlier
# dropna calls returned new DataFrames and did not modify this one.
df_miss_no_income.show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=2: every row has at least 2 non-null values, so nothing is dropped.
df_miss_no_income.dropna(thresh=2).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=3: the id-3 row (2 non-null values) is dropped.
df_miss_no_income.dropna(thresh=3).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=1: every row has a non-null id, so nothing is dropped.
df_miss_no_income.dropna(thresh=1).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=0: no minimum is required, so every row is kept.
df_miss_no_income.dropna(thresh=0).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=2: every row has at least 2 non-null values, so nothing is dropped.
df_miss_no_income.dropna(thresh=2).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=3: the id-3 row (2 non-null values) is dropped.
df_miss_no_income.dropna(thresh=3).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Re-run with thresh=1: every row has a non-null id, so nothing is dropped.
df_miss_no_income.dropna(thresh=1).show()
+---+------+------+----+------+
| id|weight|height| age|gender|
+---+------+------+----+------+
| 1| 143.5| 5.6| 28| M|
| 2| 167.2| 5.4| 45| M|
| 3| null| 5.2|null| null|
| 4| 144.5| 5.9| 33| M|
| 5| 133.2| 5.7| 54| F|
| 6| 124.1| 5.2|null| F|
+---+------+------+----+------+
# Same experiment on the full DataFrame (with 'income'). thresh=2: the sparsest
# row (id 3) has exactly 2 non-null values, so nothing is dropped.
df_miss.dropna(thresh=2).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# thresh=3: the id-3 row has only 2 non-null values (id, height), so it is dropped.
df_miss.dropna(thresh=3).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# Show the DataFrame itself: all six rows are still present, so the dropna calls
# above returned new DataFrames and did not modify df_miss.
df_miss.show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# thresh=1: every row has a non-null id, so nothing is dropped.
df_miss.dropna(thresh=1).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# No arguments: as the output shows, only the row with no nulls at all (id 1) survives.
df_miss.dropna().show()
+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
| 1| 143.5| 5.6| 28| M|100000|
+---+------+------+---+------+------+
# Re-run with no arguments: only the fully populated row (id 1) survives.
df_miss.dropna().show()
+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
| 1| 143.5| 5.6| 28| M|100000|
+---+------+------+---+------+------+
# thresh=0: no minimum is required, so every row is kept.
df_miss.dropna(thresh=0).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# Re-run with thresh=1: every row has a non-null id, so nothing is dropped.
df_miss.dropna(thresh=1).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# Re-run with thresh=2: the sparsest row (id 3) has exactly 2 non-null values, so nothing is dropped.
df_miss.dropna(thresh=2).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 3| null| 5.2|null| null| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# Re-run with thresh=3: the id-3 row (2 non-null values) is dropped.
df_miss.dropna(thresh=3).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# thresh=4: id-3 (2 non-null values) is dropped; id-6 has exactly 4 non-null values and is kept.
df_miss.dropna(thresh=4).show()
+---+------+------+----+------+------+
| id|weight|height| age|gender|income|
+---+------+------+----+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
| 6| 124.1| 5.2|null| F| null|
+---+------+------+----+------+------+
# thresh=5: id-3 (2 non-null) and id-6 (4 non-null) are dropped; rows missing only 'income' have 5 and stay.
df_miss.dropna(thresh=5).show()
+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
| 1| 143.5| 5.6| 28| M|100000|
| 2| 167.2| 5.4| 45| M| null|
| 4| 144.5| 5.9| 33| M| null|
| 5| 133.2| 5.7| 54| F| null|
+---+------+------+---+------+------+
# thresh=6: equals the column count, so only the fully populated row (id 1) survives.
df_miss.dropna(thresh=6).show()
+---+------+------+---+------+------+
| id|weight|height|age|gender|income|
+---+------+------+---+------+------+
| 1| 143.5| 5.6| 28| M|100000|
+---+------+------+---+------+------+