PySpark learning: custom UDFs

# demo 1: register a UDF and call it from Spark SQL
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import IntegerType

if __name__ == '__main__':
    spark = SparkSession.builder.getOrCreate()

    num = spark.sparkContext.parallelize([1, 2, 3, 4, 5]).map(lambda x: Row(num=x))

    numDF = spark.createDataFrame(num)
    # register a temporary view
    numDF.createOrReplaceTempView("number")

    # define the UDF
    def square(x):
        return x ** 2

    # register the UDF with an explicit return type (register defaults to StringType)
    spark.udf.register("square", square, IntegerType())
    spark.sql("select square(num) as square_num from number").show()

# output
"""
|square_num|
+---------+
|        1|
|        4|
|        9|
|       16|
|       25|
+---------+
"""
# demo 2: classify records into grades by score

from pyspark.sql import SparkSession, Row

if __name__ == '__main__':
    spark = SparkSession.builder.getOrCreate()

    # create test data: id, name, score
    data = spark.sparkContext.parallelize(
        ['1 james 32', '2 tom 62', '3 jack 81', '4 danny 90']
    ).map(lambda x: x.split(' ')).map(
        lambda x: Row(id=x[0], name=x[1], score=int(x[2]))
    )

    dataDF = spark.createDataFrame(data)

    dataDF.createOrReplaceTempView("data")

    # define the UDF: score < 60 -> D, 60-79 -> C, 80-89 -> B, 90+ -> A
    def classification(score):
        if score < 60:
            return 'D'
        elif score < 80:
            return 'C'
        elif score < 90:
            return 'B'
        else:
            return 'A'

    # register the UDF (the default StringType return type fits here)
    spark.udf.register('classification', classification)
    spark.sql("select id,name,score,classification(score) as score_grade from data").show()

# output
"""
+---+-----+-----+-----------+
| id| name|score|score_grade|
+---+-----+-----+-----------+
|  1|james|   32|          D|
|  2|  tom|   62|          C|
|  3| jack|   81|          B|
|  4|danny|   90|          A|
+---+-----+-----+-----------+
"""
