import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.types import Row
# Build the SparkSession through the builder API: appName() gives the
# application a name, and getOrCreate() returns an already-created
# SparkSession if there is one, otherwise creates it.
spark = (
    SparkSession.builder
    .appName("pysaprk")
    .getOrCreate()
)
from pyspark.sql import functions as F
import pandas as pd

# Small pandas frame with three string-valued columns, used as the demo input.
df_pd = pd.DataFrame(dict(id=['A', 'B'], index=['1', '2'], index1=['2', '4']))
df_pd
| | id | index | index1 |
|---|---|---|---|
| 0 | A | 1 | 2 |
| 1 | B | 2 | 4 |
# Convert the pandas DataFrame into a Spark DataFrame and display its rows.
df = spark.createDataFrame(df_pd)
df.show()
+---+-----+------+
| id|index|index1|
+---+-----+------+
| A| 1| 2|
| B| 2| 4|
+---+-----+------+
# Print the column names and types of the Spark DataFrame.
df.printSchema()
root
|-- id: string (nullable = true)
|-- index: string (nullable = true)
|-- index1: string (nullable = true)
def ff(row):
    """Collect the ``id``, ``index`` and ``index1`` fields of one row.

    The collected values are also printed, which makes it visible when the
    function is invoked for each record.

    Args:
        row: A record supporting ``row[name]`` lookup for the keys ``'id'``,
            ``'index'`` and ``'index1'`` (e.g. a Spark ``Row`` or a dict).

    Returns:
        A list holding exactly one inner list ``[id, index, index1]``.
        The extra nesting is deliberate: ``map`` keeps the whole return
        value as a single element, while ``flatMap`` iterates over the
        outer list and emits the inner list as the element.

    Raises:
        KeyError or ValueError: propagated from ``row[name]`` when a field
            is missing (the exact type depends on the kind of ``row``).
    """
    # NOTE: fields are read by key rather than by attribute. On a Spark Row
    # (a tuple subclass) `row.index` would resolve to the inherited
    # tuple.index method instead of the column named 'index'.
    values = [row['id'], row['index'], row['index1']]
    print(values)
    return [values]
# map() produces exactly one output element per input row. ff returns a
# one-element outer list per row, so each row ends up as a single
# array-valued column (_1) holding [id, index, index1].
map_df = spark.createDataFrame(df.rdd.map(lambda x: ff(x)))
map_df.show()
+---------+
| _1|
+---------+
|[A, 1, 2]|
|[B, 2, 4]|
+---------+
# Print the schema of the DataFrame built from the map() result.
map_df.printSchema()
root
|-- _1: array (nullable = true)
| |-- element: string (containsNull = true)
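map()是将函数应用于RDD中的每个元素,将返回值构成新的RDD。
对于map来说,DataFrame的一行就是一个元素,函数的整个返回值(这里是 [[id, index, index1]])作为新RDD中的一个元素,因此结果只有一列 _1,类型为数组。
flatMap()是将函数应用于RDD中的每个元素,再将返回的迭代器中的所有内容展开,构成新的RDD;这里外层列表被展开,内层的 [id, index, index1] 成为一个元素,因此结果是一行三列。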
# Same map() pipeline, converted with RDD.toDF() instead of
# spark.createDataFrame().
df.rdd.map(lambda x: ff(x)).toDF().show()
+---------+
| _1|
+---------+
|[A, 1, 2]|
|[B, 2, 4]|
+---------+
# Explicit schema with three string columns named id, index and index1.
# NOTE(review): `schema` is never passed to createDataFrame() below, so the
# resulting columns get the default names _1, _2, _3 -- confirm whether
# spark.createDataFrame(rdd, schema) was intended.
schema = StructType([StructField(cn, StringType()) for cn in
['id', 'index', 'index1']])
# flatMap() iterates over the outer list returned by ff, so each inner
# [id, index, index1] list becomes one row with three columns.
flatMap_df = spark.createDataFrame(df.rdd.flatMap(lambda x: ff(x)))
flatMap_df.show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| A| 1| 2|
| B| 2| 4|
+---+---+---+
# Print the schema of the DataFrame built from the flatMap() result.
flatMap_df.printSchema()
root
|-- _1: string (nullable = true)
|-- _2: string (nullable = true)
|-- _3: string (nullable = true)
# Same flatMap() pipeline, converted with RDD.toDF() instead of
# spark.createDataFrame().
df.rdd.flatMap(lambda x: ff(x)).toDF().show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| A| 1| 2|
| B| 2| 4|
+---+---+---+
写得比较抽象,可以看下面第二篇博客,有配图,讲得更清晰些。
在有些场景下比较好用:比如对df做操作时,既要保留某些列,又要做一些列之间的复杂计算,最后还需要把结果还原成DataFrame。
[1] https://blog.csdn.net/ten_sory/article/details/80897648
[2] https://cloud.tencent.com/developer/article/1912787
2023-07-26 阴 于南京市江宁区