First, import the required modules and build a SparkSession with the desired configuration.
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, pandas_udf, PandasUDFType
conf = SparkConf()
conf.setAppName('my-app')
conf.set('spark.executor.memory', '4g')
conf.set('spark.driver.memory', '2g')
conf.set('spark.debug.maxToStringFields', '500')

spark = SparkSession.builder \
    .config(conf=conf) \
    .getOrCreate()
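Once the session exists, the effective configuration can be checked, for example:

# optional sanity check: Spark version and one of the configured values
print(spark.version)
print(spark.sparkContext.getConf().get('spark.executor.memory'))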
A udf behaves much like map: the function is applied row by row, taking a single value as input and returning a single value (in this example both input and output are of type double).
df = spark.createDataFrame(
    [(1, 1.0), (2, 2.0), (3, 3.0)],
    ("id", "value"))
# the udf receives a single double value and returns a single double value
@udf('double')
def plus_one(value):
    return value + 1
df.withColumn('value2', plus_one(df.value)).show()
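The decorator form is just shorthand: the same udf can be created by passing an ordinary Python function to udf(). A minimal sketch (plus_one_plain and plus_one_udf are names chosen only for this example):

from pyspark.sql.types import DoubleType

def plus_one_plain(value):
    return value + 1

# wrap the plain function; the return type can be a string ('double') or a DataType instance
plus_one_udf = udf(plus_one_plain, DoubleType())
df.withColumn('value2', plus_one_udf(df.value)).show()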
pandas_udf with GROUPED_MAP is used to operate on each group after a groupby. Note that the input and output pandas DataFrames may have different numbers of rows, but the returned DataFrame must match the output schema declared when creating the udf (here df.schema, i.e. the same columns as the input). There are two ways to turn an ordinary function into a pandas udf, shown in the two examples below.
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 2.0), (2, 3.0), (3, 3.0)],
    ("id", "value"))
@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
def subtract_mean(pdf):
    # pdf is a pandas DataFrame containing all rows of one id group
    return pdf.assign(value=pdf.value - pdf.value.mean())

df.groupby('id').apply(subtract_mean).show()
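For this data the result is deterministic: each value has its group mean subtracted, so the output looks roughly like the following (row order across groups may vary):

# +---+-----+
# | id|value|
# +---+-----+
# |  1| -0.5|
# |  1|  0.5|
# |  2| -0.5|
# |  2|  0.5|
# |  3|  0.0|
# +---+-----+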
df = spark.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 2.0), (2, 3.0), (3, 3.0)],
    ("id", "value"))
def subtract_mean(pdf):
    return pdf.assign(value=pdf.value - pdf.value.mean())

# equivalent to the decorator form above: wrap the plain function with pandas_udf
udf_func = pandas_udf(subtract_mean, df.schema, PandasUDFType.GROUPED_MAP)
df.groupby('id').apply(udf_func).show()
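Both variants give the same result. To illustrate that only the declared output schema matters (the returned DataFrame does not have to keep the input's shape), here is a sketch of a grouped-map udf that collapses each group to a single row; mean_schema and group_mean are hypothetical names used only for this example:

import pandas as pd
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

# hypothetical output schema that differs from the input schema
mean_schema = StructType([
    StructField('id', LongType()),
    StructField('mean_value', DoubleType()),
])

@pandas_udf(mean_schema, PandasUDFType.GROUPED_MAP)
def group_mean(pdf):
    # collapse each group to one row containing the group mean
    return pd.DataFrame({'id': [pdf.id.iloc[0]], 'mean_value': [pdf.value.mean()]})

df.groupby('id').apply(group_mean).show()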