1、Quickstart: DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
方法一:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row
df = spark.createDataFrame([
Row(a=1, b=2., c='string1', d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
Row(a=2, b=3., c='string2', d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
Row(a=4, b=5., c='string3', d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0))
])
df
或
df = spark.createDataFrame([
(1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
(2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
(3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
], schema='a long, b double, c string, d date, e timestamp')
df
方法二:从 pandas DataFrame 创建
pandas_df = pd.DataFrame({
'a': [1, 2, 3],
'b': [2., 3., 4.],
'c': ['string1', 'string2', 'string3'],
'd': [date(2000, 1, 1), date(2000, 2, 1), date(2000, 3, 1)],
'e': [datetime(2000, 1, 1, 12, 0), datetime(2000, 1, 2, 12, 0), datetime(2000, 1, 3, 12, 0)]
})
df = spark.createDataFrame(pandas_df)
df
方法三:from rdd
rdd = spark.sparkContext.parallelize([
(1, 2., 'string1', date(2000, 1, 1), datetime(2000, 1, 1, 12, 0)),
(2, 3., 'string2', date(2000, 2, 1), datetime(2000, 1, 2, 12, 0)),
(3, 4., 'string3', date(2000, 3, 1), datetime(2000, 1, 3, 12, 0))
])
df = spark.createDataFrame(rdd, schema=['a', 'b', 'c', 'd', 'e'])
df
Viewing Data
df.show()
df.printSchema()
df.show(1, vertical=True) # 竖着,翻过来显示
df.select("a", "b", "c").describe().show()
df.collect() # 将分布式数据作为Python中的本地数据收集到driver。它会将所有数据从执行程序收集到驱动程序端
df.take(1) # 取前1个
df.tail(5) # 取后5个
df.toPandas() # 转换回pandas。 toPandas还将所有数据收集到驱动程序端,当数据太大而无法放入驱动程序端时,这些数据很容易导致内存不足错误。
Selecting and Accessing Data
df.a
from pyspark.sql import Column
from pyspark.sql.functions import upper
type(df.c) == type(upper(df.c)) == type(df.c.isNull())
df.select(df.c).show()
df.withColumn('upper_c', upper(df.c)).show()
df.filter(df.a == 1).show()
为了更高效的 Spark 计算,我们将启用基于 arrow 的列式数据传输。
# 'spark.sql.execution.arrow.enabled' has been deprecated since Spark 3.0;
# the PySpark-specific key below is its replacement.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')
Applying a Function
PySpark Pandas UDF(用户定义函数)(一个用于在 Spark 集群上分发 Python 函数的框架)
“Pandas UDF 是由 Spark 执行的用户定义函数,使用 Arrow 传输数据、使用 Pandas 处理数据,从而支持向量化操作。
Pandas UDF 通过将 pandas_udf 用作装饰器或包装函数来定义,不需要额外的配置。Pandas UDF 通常表现为常规的 PySpark 函数 API。”
Pandas UDF 分组数据允许在数据集的每一组中进行操作。由于 spark 中的分组操作是跨集群节点计算的,因此我们可以以允许在不同节点计算不同模型的方式操作我们的数据集。
import pandas as pd
from pyspark.sql.functions import pandas_udf
@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Add one to every element of ``series`` using pandas Series arithmetic.

    The decorator declares ``long`` as the result type of the UDF.
    """
    return series + 1
df.select(pandas_plus_one(df.a)).show()
# mapInPandas允许用户直接使用pandas DataFrame中的API,而无需任何限制,例如结果长度。
def pandas_filter_func(iterator):
    """Filter each pandas DataFrame in ``iterator`` down to rows where ``a == 1``.

    Args:
        iterator: An iterable of pandas DataFrames, each with a column ``a``.

    Yields:
        One DataFrame per input DataFrame, containing only the rows whose
        ``a`` value equals 1 (possibly empty).
    """
    for pandas_df in iterator:
        yield pandas_df[pandas_df.a == 1]
df.mapInPandas(pandas_filter_func, schema=df.schema).show()
Grouping Data
df = spark.createDataFrame([
['red', 'banana', 1, 10], ['blue', 'banana', 2, 20], ['red', 'carrot', 3, 30],
['blue', 'grape', 4, 40], ['red', 'carrot', 5, 50], ['black', 'carrot', 6, 60],
['red', 'banana', 7, 70], ['red', 'grape', 8, 80]], schema=['color', 'fruit', 'v1', 'v2'])
df.show()
df.groupby('color').avg().show()
def plus_mean(pandas_df):
    """Return a copy of ``pandas_df`` with ``v1`` centered on its own mean.

    NOTE: despite the name, the mean of ``v1`` is *subtracted* from each
    value, not added.

    Args:
        pandas_df: A pandas DataFrame with a numeric ``v1`` column.

    Returns:
        A new DataFrame (``assign`` does not modify the input) whose ``v1``
        column is ``v1 - v1.mean()``.
    """
    return pandas_df.assign(v1=pandas_df.v1 - pandas_df.v1.mean())
df.groupby('color').applyInPandas(plus_mean, schema=df.schema).show()
df1 = spark.createDataFrame(
[(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
('time', 'id', 'v1'))
df2 = spark.createDataFrame(
[(20000101, 1, 'x'), (20000101, 2, 'y')],
('time', 'id', 'v2'))
def asof_join(l, r):
    """As-of join two pandas DataFrames on ``time``, matching rows by ``id``.

    Args:
        l: Left pandas DataFrame with ``time`` and ``id`` columns.
        r: Right pandas DataFrame with ``time`` and ``id`` columns.

    Returns:
        The result of ``pd.merge_asof(l, r, on='time', by='id')``.
    """
    return pd.merge_asof(l, r, on='time', by='id')
df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
asof_join, schema='time int, id int, v1 double, v2 string').show()
Getting Data in/out
df.write.csv('foo.csv', header=True)
spark.read.csv('foo.csv', header=True).show()
df.write.parquet('bar.parquet')
spark.read.parquet('bar.parquet').show()
df.write.orc('zoo.orc')
spark.read.orc('zoo.orc').show()
--------------------------------------------------------------------------------
Working with SQL
df.createOrReplaceTempView("tableA")
spark.sql("SELECT count(*) from tableA").show()
@pandas_udf("integer")
def add_one(s: pd.Series) -> pd.Series:
    """Add one to every element of ``s``.

    The decorator declares ``integer`` as the result type of the UDF.
    """
    return s + 1
# UDF可以在开箱即用的SQL中注册和调用
spark.udf.register("add_one", add_one)
spark.sql("SELECT add_one(v1) FROM tableA").show()
from pyspark.sql.functions import expr
# 这些SQL表达式可以直接混合并用作PySpark列。
df.selectExpr('add_one(v1)').show()
df.select(expr('count(*)') > 0).show()
--------------------------------------------------------------------------------
2、Quickstart: Pandas API on Spark
import pandas as pd
import numpy as np
import pyspark.pandas as ps
from pyspark.sql import SparkSession
Object Creation
s = ps.Series([1, 3, 5, np.nan, 6, 8])
psdf = ps.DataFrame(
{'a': [1, 2, 3, 4, 5, 6],
'b': [100, 200, 300, 400, 500, 600],
'c': ["one", "two", "three", "four", "five", "six"]},
index=[10, 20, 30, 40, 50, 60])
dates = pd.date_range('20130101', periods=6)
pdf = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
psdf = ps.from_pandas(pdf)
spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame(pdf)
sdf.show()
psdf = sdf.pandas_api()
psdf.dtypes
psdf.head()
psdf.index
psdf.columns
psdf.to_numpy()
psdf.describe()
psdf.T
psdf.sort_index(ascending=False)
psdf.sort_values(by='B')
Missing Data
pdf1 = pdf.reindex(index=dates[0:4], columns=list(pdf.columns) + ['E'])
pdf1.loc[dates[0]:dates[1], 'E'] = 1
psdf1 = ps.from_pandas(pdf1)
psdf1.dropna(how='any')
psdf1.fillna(value=5)
Operations
psdf.mean()
Spark Configurations
prev = spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") # Keep its default value.
ps.set_option("compute.default_index_type", "distributed") # Use default index prevent overhead.
import warnings
warnings.filterwarnings("ignore") # Ignore warnings coming from Arrow optimizations.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)
%timeit ps.range(300000).to_pandas()
略
https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html#Spark-Configurations
Grouping
psdf.groupby('A').sum()
Plotting
pser = pd.Series(np.random.randn(1000),
index=pd.date_range('1/1/2000', periods=1000))
psser = ps.Series(pser)
psser = psser.cummax() # cummax: cumulative maximum. The result has the same length as the input, and each element is the largest value seen from the start of the Series up to and including that position.
psser.plot()
Getting data in/out
psdf.to_csv('foo.csv')
ps.read_csv('foo.csv').head(10)
Spark IO
psdf.to_spark_io('zoo.orc', format="orc")
ps.read_spark_io('zoo.orc', format="orc").head(10)
--------------------------------------------------------------------------------
3、Python Package Management
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql import SparkSession
def main(spark):
    """Print the per-``id`` mean of ``v`` for a small hard-coded DataFrame.

    Args:
        spark: The object used to create the DataFrame via
            ``spark.createDataFrame`` (the caller passes a SparkSession).
    """
    df = spark.createDataFrame(
        [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
        ("id", "v"))

    @pandas_udf("double")
    def mean_udf(v: pd.Series) -> float:
        # Series in, scalar out: the type hints mark this as an aggregating UDF.
        return v.mean()

    print(df.groupby("id").agg(mean_udf(df['v'])).collect())
# Run the job only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main(SparkSession.builder.getOrCreate())
import os
from pyspark.sql import SparkSession
from app import main
os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"
spark = SparkSession.builder.config(
"spark.archives", # 'spark.yarn.dist.archives' in YARN.
"pyspark_conda_env.tar.gz#environment").getOrCreate()
main(spark)
--------------------------------------------------------------------------------
4、Spark SQL
Apache Arrow是一种内存中的列式数据格式,在Spark中用于在JVM和Python进程之间高效地传输数据。目前,这对使用Pandas/NumPy数据的Python用户最为有利。
pip install "pyspark[sql]"
import numpy as np
import pandas as pd
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# Generate a Pandas DataFrame
pdf = pd.DataFrame(np.random.rand(100, 3))
# Create a Spark DataFrame from a Pandas DataFrame using Arrow
df = spark.createDataFrame(pdf)
# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()
print("Pandas DataFrame result statistics:\n%s\n" % str(result_pdf.describe()))
Pandas UDFs
import pandas as pd
from pyspark.sql.functions import pandas_udf
@pandas_udf("col1 string, col2 long")  # type: ignore[call-overload]
def func(s1: pd.Series, s2: pd.Series, s3: pd.DataFrame) -> pd.DataFrame:
    """Add a ``col2`` column to ``s3`` and return it.

    ``col2`` is ``s1`` plus the string length of each element of ``s2``.
    ``s3`` is modified in place and the same object is returned.
    """
    s3['col2'] = s1 + s2.str.len()
    return s3
# Create a Spark DataFrame that has three columns including a struct column.
# The struct type must list its fields; a bare "struct" is not a complete type.
df = spark.createDataFrame(
    [[1, "a string", ("a nested string",)]],
    "long_col long, string_col string, struct_col struct<col1:string>")
df.printSchema()
# root
# |-- long_col: long (nullable = true)
# |-- string_col: string (nullable = true)
# |-- struct_col: struct (nullable = true)
# |    |-- col1: string (nullable = true)
df.select(func("long_col", "string_col", "struct_col")).printSchema()
# |-- func(long_col, string_col, struct_col): struct (nullable = true)
# | |-- col1: string (nullable = true)
# | |-- col2: long (nullable = true)
略https://spark.apache.org/docs/latest/api/python/user_guide/sql/arrow_pandas.html
from typing import Iterator, Tuple
import pandas as pd
from pyspark.sql.functions import pandas_udf
pdf = pd.DataFrame([1, 2, 3], columns=["x"])
df = spark.createDataFrame(pdf)
# Declare the function and create the UDF
@pandas_udf("long")  # type: ignore[call-overload]
def multiply_two_cols(
        iterator: Iterator[Tuple[pd.Series, pd.Series]]) -> Iterator[pd.Series]:
    """Yield the element-wise product of each ``(a, b)`` pair in ``iterator``.

    Args:
        iterator: An iterator of 2-tuples of pandas Series.

    Yields:
        ``a * b`` for every pair, in the order received.
    """
    for a, b in iterator:
        yield a * b
df.select(multiply_two_cols("x", "x")).show()
# +-----------------------+
# |multiply_two_cols(x, x)|
# +-----------------------+
# | 1|
# | 4|
# | 9|
# +-----------------------+
import pandas as pd
df1 = spark.createDataFrame(
[(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
("time", "id", "v1"))
df2 = spark.createDataFrame(
[(20000101, 1, "x"), (20000101, 2, "y")],
("time", "id", "v2"))
def asof_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    """As-of join two pandas DataFrames on ``time``, matching rows by ``id``.

    Args:
        left: Left DataFrame with ``time`` and ``id`` columns.
        right: Right DataFrame with ``time`` and ``id`` columns.

    Returns:
        The result of ``pd.merge_asof(left, right, on="time", by="id")``.
    """
    return pd.merge_asof(left, right, on="time", by="id")
df1.groupby("id").cogroup(df2.groupby("id")).applyInPandas(
asof_join, schema="time int, id int, v1 double, v2 string").show()
# +--------+---+---+---+
# | time| id| v1| v2|
# +--------+---+---+---+
# |20000101| 1|1.0| x|
# |20000102| 1|3.0| x|
# |20000101| 2|2.0| y|
# |20000102| 2|4.0| y|
# +--------+---+---+---+
--------------------------------------------------------------------------------
pyspark.sql.SparkSession
# Build (or reuse) a session: local master, an application name, and one
# example configuration option.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)
Parquet - Hadoop 生态中的列式存储格式
Parquet 是一种面向列式存储的文件格式,Cloudera 的大数据在线分析(OLAP)项目 Impala 使用该格式作为列式存储。
# To create DataFrame using SparkSession
people = spark.read.parquet("...")
department = spark.read.parquet("...")
people.filter(people.age > 30).join(department, people.deptId == department.id) \
.groupBy(department.name, "gender").agg({"salary": "avg", "age": "max"})
pyspark.sql.Observation
Observation 用于在某个 action 执行期间,对流经 Dataset 的所有数据计算预先定义的聚合指标(metrics)。
from pyspark.sql.functions import col, count, lit, max
>>> from pyspark.sql import Observation
>>> df = spark.createDataFrame([["Alice", 2], ["Bob", 5]], ["name", "age"])
>>> observation = Observation("my metrics")
>>> observed_df = df.observe(observation, count(lit(1)).alias("count"), max(col("age")))
>>> observed_df.count()
2
>>> observation.get
{'count': 2, 'max(age)': 5}
pyspark.sql.SparkSession.builder.getOrCreate
pyspark.sql.Row
row = Row(name="Alice", age=11)
>>> row
Row(name='Alice', age=11)
>>> row['name'], row['age']
('Alice', 11)
>>> row.name, row.age
('Alice', 11)
s1 = SparkSession.builder.config("k1", "v1").getOrCreate()
>>> s1.conf.get("k1") == "v1"
True
pyspark.sql.SparkSession.builder.master
builder.master(master: str) → pyspark.sql.session.SparkSession.Builder
Sets the Spark master URL to connect to, such as “local” to run locally, “local[4]” to run locally with 4 cores, or “spark://master:7077” to run on a Spark standalone cluster.
pyspark.sql.SparkSession.createDataFrame
--------------------------------------------------------------------------------
5、Pandas API on Spark
略https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html
>>> import pyspark.pandas as ps
>>>
>>> psdf = ps.range(10)
>>> sdf = psdf.to_spark().filter("id > 5")
>>> sdf.show()
+---+
| id|
+---+
| 6|
| 7|
| 8|
| 9|
+---+
>>> # Create a pandas-on-Spark DataFrame with an explicit index.
... psdf = ps.DataFrame({'id': range(10)}, index=range(10))
>>> # Keep the explicit index.
... sdf = psdf.to_spark(index_col='index')
>>> # Call Spark APIs
... sdf = sdf.filter("id > 5")
>>> # Uses the explicit index to avoid to create default index.
... sdf.pandas_api(index_col='index')
id
index
6 6
7 7
8 8
9 9
transform and apply
>>> psdf = ps.DataFrame({'a': [1,2,3], 'b':[4,5,6]})
>>> def pandas_plus(pser):
...     return pser + 1  # should always return the same length as input.
...
>>> psdf.transform(pandas_plus)
From/to other DBMSes
The APIs to read/write from/to external DBMSes are as follows:
read_sql_table(table_name, con[, schema, …]) Read SQL database table into a DataFrame.
read_sql_query(sql, con[, index_col]) Read SQL query into a DataFrame.
read_sql(sql, con[, index_col, columns]) Read SQL query or database table into a DataFrame.
import sqlite3
con = sqlite3.connect('example.db')
cur = con.cursor()
# Create table
cur.execute(
'''CREATE TABLE stocks
(date text, trans text, symbol text, qty real, price real)''')
# Insert a row of data
cur.execute("INSERT INTO stocks VALUES ('2006-01-05','BUY','RHAT',100,35.14)")
# Save (commit) the changes
con.commit()
con.close()
import os
from pyspark.sql import SparkSession
(SparkSession.builder
.master("local")
.appName("SQLite JDBC")
.config(
"spark.jars",
"{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd()))
.config(
"spark.driver.extraClassPath",
"{}/sqlite-jdbc-3.34.0.jar".format(os.getcwd()))
.getOrCreate())
import pyspark.pandas as ps
df = ps.read_sql("stocks", con="jdbc:sqlite:{}/example.db".format(os.getcwd()))
df
df.price += 1
df.spark.to_spark_io(
format="jdbc", mode="append",
dbtable="stocks", url="jdbc:sqlite:{}/example.db".format(os.getcwd()))
ps.read_sql("stocks", con="jdbc:sqlite:{}/example.db".format(os.getcwd()))
from pyspark.sql import SparkSession
builder = SparkSession.builder.appName("pandas-on-spark")
builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
# Pandas API on Spark automatically uses this Spark session with the configurations set.
builder.getOrCreate()
Best Practices
from pyspark.sql import SparkSession
builder = SparkSession.builder.appName("pandas-on-spark")
builder = builder.config("spark.sql.execution.arrow.pyspark.enabled", "true")
# Pandas API on Spark automatically uses this Spark session with the configurations set.
builder.getOrCreate()
import pyspark.pandas as ps
...
或
from pyspark import SparkConf, SparkContext
conf = SparkConf()
conf.set('spark.executor.memory', '2g')
# Pandas API on Spark automatically uses this Spark context with the configurations set.
SparkContext(conf=conf)
import pyspark.pandas as ps
...
>>> import pyspark.pandas as ps
>>> psdf = ps.DataFrame({'id': range(10)})
>>> psdf = psdf[psdf.id > 5]
>>> psdf.spark.explain()
== Physical Plan ==
*(1) Filter (id#1L > 5)
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#1L]
>>> psdf['id'] = psdf['id'] + (10 * psdf['id'] + psdf['id'])
>>> psdf = psdf.groupby('id').head(2)
>>> psdf.spark.explain()
== Physical Plan ==
*(3) Project [__index_level_0__#0L, id#31L]
+- *(3) Filter (isnotnull(__row_number__#44) AND (__row_number__#44 <= 2))
+- Window [row_number() windowspecdefinition(__groupkey_0__#36L, __natural_order__#16L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS __row_number__#44], [__groupkey_0__#36L], [__natural_order__#16L ASC NULLS FIRST]
+- *(2) Sort [__groupkey_0__#36L ASC NULLS FIRST, __natural_order__#16L ASC NULLS FIRST], false, 0
+- Exchange hashpartitioning(__groupkey_0__#36L, 200), true, [id=#33]
+- *(1) Project [__index_level_0__#0L, (id#1L + ((id#1L * 10) + id#1L)) AS __groupkey_0__#36L, (id#1L + ((id#1L * 10) + id#1L)) AS id#31L, __natural_order__#16L]
+- *(1) Project [__index_level_0__#0L, id#1L, monotonically_increasing_id() AS __natural_order__#16L]
+- *(1) Filter (id#1L > 5)
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#1L]
>>> psdf = psdf.spark.local_checkpoint() # or psdf.spark.checkpoint()
>>> psdf.spark.explain()
== Physical Plan ==
*(1) Project [__index_level_0__#0L, id#31L]
+- *(1) Scan ExistingRDD[__index_level_0__#0L,id#31L,__natural_order__#59L]
Avoid shuffling
Avoid computation on single partition
Avoid reserved column names
Do not use duplicated column names
Use distributed or distributed-sequence default index
Reduce the operations on different DataFrame/Series
Use pandas API on Spark directly whenever possible
2022.10.26
Index of /docs