from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
# Build the SparkSession entry point for the application
spark = SparkSession.builder.\
appName("test").\
master("local[*]").\
config("spark.sql.shuffle.partitions", 2).\
getOrCreate()
sc = spark.sparkContext
rdd = sc.parallelize([(1001, "Tom", 12), (1002, "Jerry", 13), (1003, "John", 14)])
rdd = rdd.map(lambda x: Row(x[0], x[1], x[2]))
'''
StructType describes the DataFrame schema (a list of StructFields)
StructField(name, dataType, nullable) describes a single column
Row(1, 'a', 12) holds one row of data
Column holds one column of data plus its column metadata (StructField)
'''
schema = StructType([StructField("id", LongType(), True)
,StructField("name", StringType(), True)
,StructField("age", IntegerType(), True)])
'''
Equivalently:
schema = StructType().add('id', LongType(), True)\
.add('name', StringType(), True)\
.add('age', IntegerType(), True)
'''
df = spark.createDataFrame(rdd, schema)
# or: df = spark.createDataFrame(rdd, schema=['id', 'name', 'age'])
'''
show() parameters:
param1 (n): how many rows to display, default 20
param2 (truncate): default True, values longer than 20 characters are truncated
'''
df.show()
# Print the schema
df.printSchema()
'''
+----+-----+---+
| id| name|age|
+----+-----+---+
|1001| Tom| 12|
|1002|Jerry| 13|
|1003| John| 14|
+----+-----+---+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
'''
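# A small sketch of the show() parameters described above, plus a Column object
# (id_col is only an illustrative name, not part of the original example):
df.show(n=2, truncate=False)  # show 2 rows and never truncate long values
id_col = df['id']             # df['id'] / df.id return a Column (data plus its StructField info)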
'''
With toDF(column names only) the column types are inferred from the RDD data,
so convert values to the desired types when building the RDD
'''
df = rdd.toDF(['id', 'name', 'age'])
df.show()
df.printSchema()
'''
+----+-----+---+
| id| name|age|
+----+-----+---+
|1001| Tom| 12|
|1002|Jerry| 13|
|1003| John| 14|
+----+-----+---+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: long (nullable = true)
'''
# A schema can also be passed in to specify the types explicitly
schema = StructType().add('id', LongType(), True)\
.add('name', StringType(), True)\
.add('age', IntegerType(), True)
df = rdd.toDF(schema=schema)
df.show()
df.printSchema()
'''
+----+-----+---+
| id| name|age|
+----+-----+---+
|1001| Tom| 12|
|1002|Jerry| 13|
|1003| John| 14|
+----+-----+---+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
'''
# pandas DataFrame >>> pyspark.sql DataFrame
import pandas as pd
pdf = pd.DataFrame(
{
'id': [1, 2, 3],
'name': ['a', 'b', 'c'],
'age': [12, 13, 14]
}
)
df = spark.createDataFrame(pdf)
df.show()
df.printSchema()
'''
+---+----+---+
| id|name|age|
+---+----+---+
| 1| a| 12|
| 2| b| 13|
| 3| c| 14|
+---+----+---+
root
|-- id: long (nullable = true)
|-- name: string (nullable = true)
|-- age: long (nullable = true)
'''
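# The reverse direction also exists: toPandas() collects the Spark DataFrame to
# the driver as a pandas DataFrame (only sensible for small data; pdf2 is just
# an illustrative name):
pdf2 = df.toPandas()
print(pdf2)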
'''
The text data source reads each whole line as one value of a single column,
named value by default, with type string.
A schema can be used to rename the column and set its type.
'''
schema = StructType().add('data', StringType(), True)
df = spark.read.format('text').\
schema(schema=schema).\
load('/test.text')
'''
+-------+
| data|
+-------+
| 1 a 12|
| 2 b 13|
| 3 c 14|
+-------+
'''
'''
The JSON data source carries its own schema information
{"name": "c"}
{"id": 1, "name": "a"}
{"id": 2, "name": "b"}
'''
df = spark.read.format('json').load('/test.text')
'''
+----+----+
|  id|name|
+----+----+
|null|   c|
|   1|   a|
|   2|   b|
+----+----+
'''
'''
csv data source
id,name,age
1,a,12
2,b,13
'''
# sep: field separator
# header=True: the first line of the file holds the column names and is used as the header
# schema: column names and data types
df = spark.read.format('csv')\
.option('sep', ',')\
.option('header', True)\
.option('encoding', 'utf-8')\
.schema('id INT, name STRING, age INT')\
.load('/test.text')
'''
+---+----+---+
| id|name|age|
+---+----+---+
| 1| a| 12|
| 2| b| 13|
+---+----+---+
'''
# Register a temporary view named age; it can then be queried with SQL
df.createOrReplaceTempView("age")
# Register a global temp view; it can be used across SparkSession objects
# Prefix the view name with global_temp. when querying it
df.createGlobalTempView('global_age')
spark.sql(
"""
select * from age limit 2
"""
).show()
'''
+----+----+---+
| id|name|age|
+----+----+---+
|1001| Tom| 12|
|1003|John| 14|
+----+----+---+
'''
spark.sql(
"""
select * from global_temp.global_age
"""
).show()
'''
+----+-----+---+
| id| name|age|
+----+-----+---+
|1001| Tom| 12|
|1002|Jerry| 13|
|1003| John| 14|
|1004| Tom| 15|
+----+-----+---+
'''
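# "Across SparkSession objects" can be illustrated with a second session created
# from the same context; a minimal sketch:
spark.newSession().sql("select * from global_temp.global_age").show()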
data = [(1001, "Tom", 12.0)
,(1002, "Jerry", 13.0)
,(1003, "John", 14.0)
,(1004, 'Tom', 15.0)
,(1003, "John", 17.353)]
rdd = sc.parallelize(data)
rdd = rdd.map(lambda x: Row(x[0], x[1], x[2]))
schema = StructType([StructField("id", LongType(), True)
,StructField("name", StringType(), True)
,StructField("age", FloatType(), True)])
df = spark.createDataFrame(rdd, schema)
df.show()
'''
+----+-----+------+
| id| name| age|
+----+-----+------+
|1001| Tom| 12.0|
|1002|Jerry| 13.0|
|1003| John| 14.0|
|1004| Tom| 15.0|
|1003| John|17.353|
+----+-----+------+
'''
df.select('id', 'age').show()
'''
+----+------+
| id| age|
+----+------+
|1001| 12.0|
|1002| 13.0|
|1003| 14.0|
|1004| 15.0|
|1003|17.353|
+----+------+
'''
df.filter('age < 14').show()
# equivalently:
df.where('age < 14').show()
'''
+----+-----+----+
| id| name| age|
+----+-----+----+
|1001| Tom|12.0|
|1002|Jerry|13.0|
+----+-----+----+
'''
from pyspark.sql import functions as F
df[df['id'] > 1003].show()
'''
+----+----+----+
| id|name| age|
+----+----+----+
|1004| Tom|15.0|
+----+----+----+
'''
df.where(df['id'] > 1003).show()
'''
+----+----+----+
| id|name| age|
+----+----+----+
|1004| Tom|15.0|
+----+----+----+
'''
df.where((df['id'] < 1003) & (df['id'] > 1001)).show()
'''
+----+-----+----+
| id| name| age|
+----+-----+----+
|1002|Jerry|13.0|
+----+-----+----+
'''
# rows whose age is greater than the average age
df.where(df['age'] > df.select(F.avg(df['age'])).first()['avg(age)']).show()
'''
+----+----+------+
| id|name| age|
+----+----+------+
|1004| Tom| 15.0|
|1003|John|17.353|
+----+----+------+
'''
df.orderBy('age', ascending=False).show()
'''
+----+-----+------+
| id| name| age|
+----+-----+------+
|1003| John|17.353|
|1004| Tom| 15.0|
|1003| John| 14.0|
|1002|Jerry| 13.0|
|1001| Tom| 12.0|
+----+-----+------+
'''
# groupBy
df.groupBy('name').agg({'id': 'max', 'age': 'max'}).show()
'''
+-----+-------+--------+
| name|max(id)|max(age)|
+-----+-------+--------+
| Tom| 1004| 15.0|
| John| 1003| 17.353|
|Jerry| 1002| 13.0|
+-----+-------+--------+
'''
from pyspark.sql import functions as F
df.groupBy('name').agg(
# average rounded to 2 decimal places
# alias renames the resulting Column
F.round(F.avg('age'), 2).alias('avg_age'),
F.max('age').alias('max_age'),
F.count('id').alias('cnt_id')
).show()
'''
+-----+-------+-------+------+
| name|avg_age|max_age|cnt_id|
+-----+-------+-------+------+
| Tom| 13.5| 15.0| 2|
|Jerry| 13.0| 13.0| 1|
| John| 15.68| 17.353| 2|
+-----+-------+-------+------+
'''
# Add a new column
from pyspark.sql.functions import lit
df1 = df.withColumn("numberOne", lit(2))
df1.show()
'''
+----+-----+------+---------+
|  id| name|   age|numberOne|
+----+-----+------+---------+
|1001|  Tom|  12.0|        2|
|1002|Jerry|  13.0|        2|
|1003| John|  14.0|        2|
|1004|  Tom|  15.0|        2|
|1003| John|17.353|        2|
+----+-----+------+---------+
'''
from pyspark.sql import functions as F
'''
withColumn
operates on columns of the DataFrame:
if the new column name matches an existing column, that column is replaced,
otherwise the new column is appended at the end
(an overwrite example is sketched after the output below)
'''
df.withColumn('class & No.', F.split(df['id'], '00')).show()
'''
+----+-----+------+-----------+
| id| name| age|class & No.|
+----+-----+------+-----------+
|1001| Tom| 12.0| [1, 1]|
|1002|Jerry| 13.0| [1, 2]|
|1003| John| 14.0| [1, 3]|
|1004| Tom| 15.0| [1, 4]|
|1003| John|17.353| [1, 3]|
+----+-----+------+-----------+
'''
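# Overwriting an existing column: reusing an existing column name replaces that
# column instead of appending a new one; a minimal sketch:
df.withColumn('age', df['age'] + 1).show()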
# Drop a column
df1.drop('numberOne').show()
'''
+----+-----+------+
|  id| name|   age|
+----+-----+------+
|1001|  Tom|  12.0|
|1002|Jerry|  13.0|
|1003| John|  14.0|
|1004|  Tom|  15.0|
|1003| John|17.353|
+----+-----+------+
'''
# Rename a column
df.withColumnRenamed('id', 'No.').show()
'''
+----+-----+------+
| No.| name| age|
+----+-----+------+
|1001| Tom| 12.0|
|1002|Jerry| 13.0|
|1003| John| 14.0|
|1004| Tom| 15.0|
|1003| John|17.353|
+----+-----+------+
'''
from pyspark.sql.functions import col
# Cast a column to another type
df.withColumn('age_int', col('age').cast('int')).show()
'''
+----+-----+------+-------+
| id| name| age|age_int|
+----+-----+------+-------+
|1001| Tom| 12.0| 12|
|1002|Jerry| 13.0| 13|
|1003| John| 14.0| 14|
|1004| Tom| 15.0| 15|
|1003| John|17.353| 17|
+----+-----+------+-------+
'''
'''
Deduplication
With no arguments, only the first of each group of fully identical rows is kept;
with a list of columns, only the first row for each distinct value combination
of those columns is kept
'''
df.dropDuplicates(['name']).show()
'''
Dropping rows with missing values
With no arguments, any row containing a null is dropped;
the parameters how, thresh and subset control which rows get dropped
(see the sketch after df.dropna().show() below)
'''
df.dropna().show()
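# A sketch of the parameterised forms of dropna:
# df.dropna(how='all')               # drop a row only if all of its values are null
# df.dropna(thresh=2)                # keep rows that have at least 2 non-null values
# df.dropna(subset=['name', 'age'])  # only consider the listed columns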
df.fillna('loss')                                # fill missing values with 'loss'
df.fillna('loss', subset=['name'])               # fill missing values only in the listed columns
df.fillna({'id': 0, 'name': 'loss', 'age': -1})  # per-column fill rules
data = [(1001, "Tom", 12.0)
,(1002, "Jerry", 13.0)
,(1003, "John", 14.0)
,(1004, 'Tom', 15.0)
,(1003, "John", 17.353)]
rdd = sc.parallelize(data)
rdd = rdd.map(lambda x: Row(x[0], x[1], x[2]))
df_l = spark.createDataFrame(rdd, schema=['id', 'name', 'age'])
df_l.show()
'''
+----+-----+------+
| id| name| age|
+----+-----+------+
|1001| Tom| 12.0|
|1002|Jerry| 13.0|
|1003| John| 14.0|
|1004| Tom| 15.0|
|1003| John|17.353|
+----+-----+------+
'''
df_r = df_l.where(df_l['id'] > 1002).\
withColumn('class', F.split(df_l['id'], '00')[0])
df_r.show()
'''
+----+----+------+-----+
| id|name| age|class|
+----+----+------+-----+
|1003|John| 14.0| 1|
|1004| Tom| 15.0| 1|
|1003|John|17.353| 1|
+----+----+------+-----+
'''
df_l.join(df_r[['id', 'class']], how='inner', on='id').show()
'''
+----+----+------+-----+
| id|name| age|class|
+----+----+------+-----+
|1003|John| 14.0| 1|
|1003|John| 14.0| 1|
|1003|John|17.353| 1|
|1003|John|17.353| 1|
|1004| Tom| 15.0| 1|
+----+----+------+-----+
'''
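# Other join types use the same API; for example a left join keeps every row of
# the left DataFrame (a minimal sketch):
df_l.join(df_r[['id', 'class']], on='id', how='left').show()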
rdd = sc.parallelize([(1001, 'a b c'), (1002, 'd e f')])
rdd = rdd.map(lambda x: (x[0], x[1].split()))
df = rdd.toDF(schema=['id', 'line'])
df.show()
'''
+----+---------+
| id| line|
+----+---------+
|1001|[a, b, c]|
|1002|[d, e, f]|
+----+---------+
'''
from pyspark.sql import functions as F
df.withColumn('explode_line', F.explode(df['line'])).show()
'''
+----+---------+------------+
| id| line|explode_line|
+----+---------+------------+
|1001|[a, b, c]| a|
|1001|[a, b, c]| b|
|1001|[a, b, c]| c|
|1002|[d, e, f]| d|
|1002|[d, e, f]| e|
|1002|[d, e, f]| f|
+----+---------+------------+
'''
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
spark = SparkSession.builder.\
appName('test').\
master('local[*]').\
config('spark.sql.shuffle.partitions', 2).\
getOrCreate()
sc = spark.sparkContext
rdd = sc.parallelize([1, 2, 3]).\
map(lambda x: [x])
df = rdd.toDF(['num'])
df.show()
'''
+---+
|num|
+---+
| 1|
| 2|
| 3|
+---+
'''
# spark.udf.register()
def num_ride_10(num):
    return num * 10
'''
param1: the name the UDF is registered under, usable only in SQL style
param2: the processing function
param3: the return type of the UDF
return: a UDF object, usable only in DSL style
'''
udf_dsl = spark.udf.register('udf_sql', num_ride_10, IntegerType())
df.createOrReplaceTempView("num")
spark.sql('select udf_sql(*) from num').show()
# df.selectExpr('udf_sql(num)').show()
'''
+------------+
|udf_sql(num)|
+------------+
| 10|
| 20|
| 30|
+------------+
'''
# A DSL-style UDF only accepts Column objects
# Note: the result column is named udf_sql, i.e. it follows the name used at registration
df.select(udf_dsl(df['num'])).show()
'''
+------------+
|udf_sql(num)|
+------------+
| 10|
| 20|
| 30|
+------------+
'''
def num_ride_10(num):
    return num * 10
udf = F.udf(num_ride_10, IntegerType())
df.select(udf(df['num'])).show()
'''
+----------------+
|num_ride_10(num)|
+----------------+
| 10|
| 20|
| 30|
+----------------+
'''
rdd = sc.parallelize([['hello nice good a'], ['hello b'], ['nice c']])
df = rdd.toDF(['line'])
df.show()
'''
+-----------------+
| line|
+-----------------+
|hello nice good a|
| hello b|
| nice c|
+-----------------+
'''
# Returning an ArrayType
def split_line(line):
    return line.split(' ')
# Declare that the returned array holds string elements
udf_split = spark.udf.register('udf_split', split_line, ArrayType(StringType()))
# truncate=False shows the full content of each row; otherwise long values are shortened to '...'
df.select(udf_split(df['line'])).show(truncate=False)
'''
+----------------------+
|udf_split(line) |
+----------------------+
|[hello, nice, good, a]|
|[hello, b] |
|[nice, c] |
+----------------------+
'''
import string
'''
Returning a dict
Think of the dict as one record of a table (like a JSON object)
and receive it with a StructType()
'''
rdd = sc.parallelize([[1], [2], [3]])
df = rdd.toDF(['num'])
def get_letter(num):
    return {'num': num, 'letter': string.ascii_letters[num]}
udf = spark.udf.register('udf',
get_letter,
StructType().add('num', IntegerType(), True)\
.add('letter', StringType(), True)
)
df.select(udf(df['num'])).show()
'''
+--------+
|udf(num)|
+--------+
| [1, b]|
| [2, c]|
| [3, d]|
+--------+
'''
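# The fields of the returned struct can be accessed like nested columns;
# a minimal sketch:
df.select(udf(df['num']).getField('letter').alias('letter')).show()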
# Aggregate later with rdd.mapPartitions(); the data must sit in a single partition
single_partition_rdd = df.rdd.repartition(1)
single_partition_rdd.collect()
'''
[Row(num=1), Row(num=2), Row(num=3)]
'''
def process(partition):
    total = 0
    for row in partition:
        total += row['num']
    # mapPartitions must return an iterable, so wrap the result in a list
    return [total]
single_partition_rdd.mapPartitions(process).collect()
'''
[6]
'''
rdd = sc.parallelize([
(1, 'a', 12),
(2, 'b', 13),
(3, 'c', 14),
(1, 'd', 12),
(2, 'e', 16)
])
schema = StructType([
StructField('class', IntegerType(), True),
StructField('name', StringType(), True),
StructField('age', LongType(), True)
])
df = rdd.toDF(schema=schema)
df.createOrReplaceTempView('stu')
df.show()
'''
+-----+----+---+
|class|name|age|
+-----+----+---+
| 1| a| 12|
| 2| b| 13|
| 3| c| 14|
| 1| d| 12|
| 2| e| 16|
+-----+----+---+
'''
'''
Aggregate window functions
aggregate_function(column) over(...)
avg() as a plain aggregate collapses many rows into one value;
with over() it becomes a window function, so every row keeps its own data and
gets the aggregate value attached
(a DSL-style equivalent is sketched after the output below)
'''
spark.sql("""
select *, avg(age) over() as avg_age from stu
""").show()
'''
+-----+----+---+-------+
|class|name|age|avg_age|
+-----+----+---+-------+
| 3| c| 14| 13.4|
| 1| d| 12| 13.4|
| 2| e| 16| 13.4|
| 1| a| 12| 13.4|
| 2| b| 13| 13.4|
+-----+----+---+-------+
'''
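# A DSL-style equivalent of the aggregate window above, sketched with the Window
# API (an empty partitionBy() makes the window span the whole DataFrame):
from pyspark.sql.window import Window
df.withColumn('avg_age', F.avg('age').over(Window.partitionBy())).show()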
'''
Ranking window functions
row_number() over(order by age desc): sorts all rows by age descending and appends a row number
dense_rank() over(partition by class order by age desc): partitions by class, sorts by age inside each partition, appends a rank without gaps
rank() over(order by age): sorts all rows by age ascending and appends a rank
ntile(3) over(order by age desc): splits the rows, ordered by age descending, into 3 roughly equal buckets and returns the bucket number
row_number gives tied values distinct, consecutive numbers;
rank gives tied values the same number (and skips the following ranks)
(DSL equivalents are sketched after the ntile example below)
'''
spark.sql("""
select *,
row_number() over(order by age desc) as row_number_rank,
dense_rank() over(partition by class order by age desc) as dense_rank,
rank() over(order by age) as rank
from stu
""").show()
'''
+-----+----+---+---------------+----------+----+
|class|name|age|row_number_rank|dense_rank|rank|
+-----+----+---+---------------+----------+----+
| 1| a| 12| 4| 1| 1|
| 1| d| 12| 5| 1| 1|
| 2| b| 13| 3| 2| 3|
| 3| c| 14| 2| 1| 4|
| 2| e| 16| 1| 1| 5|
+-----+----+---+---------------+----------+----+
'''
spark.sql("""
select *,
ntile(3) over(order by age desc) as ntile_3
from stu
""").show()
'''
+-----+----+---+-------+
|class|name|age|ntile_3|
+-----+----+---+-------+
| 2| e| 16| 1|
| 3| c| 14| 1|
| 2| b| 13| 2|
| 1| a| 12| 2|
| 1| d| 12| 3|
+-----+----+---+-------+
'''
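# The ranking functions are also available in DSL style; a minimal sketch
# (rn and dr are just illustrative column names):
from pyspark.sql.window import Window
df.withColumn('rn', F.row_number().over(Window.orderBy(F.desc('age')))).show()
df.withColumn('dr', F.dense_rank().over(Window.partitionBy('class').orderBy(F.desc('age')))).show()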
# Caching
df.cache()  # cached to memory first; what does not fit spills to disk
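# Related calls, shown as a sketch: persist() with an explicit storage level and
# unpersist() to release the cached data.
from pyspark import StorageLevel
df.persist(StorageLevel.MEMORY_AND_DISK)
df.unpersist()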