1. Rename columns:
train_data = train_data.toDF('imei', 'pkgName', 'timestamp')
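toDF replaces all column names at once, so the list must cover every column in order. As a minimal hedged sketch (assuming spark_session is an existing SparkSession), withColumnRenamed is the per-column alternative:
# sketch: rename single columns with withColumnRenamed instead of renaming everything via toDF
toy = spark_session.createDataFrame([(1, 'a'), (2, 'b')], ('old_id', 'old_name'))
toy = toy.withColumnRenamed('old_id', 'imei').withColumnRenamed('old_name', 'pkgName')
toy.show()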
2. Drop a column:
df = df.drop('col')
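drop also accepts several column names in one call; a small sketch with illustrative names:
# sketch: drop more than one column at a time
toy = spark_session.createDataFrame([(1, 2, 3)], ('a', 'b', 'c'))
toy = toy.drop('b', 'c')  # only column a remains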
3. Select the columns given in a list:
df = spark_session.createDataFrame(
    [(1, 1.0, 5), (1, 2.0, 7), (2, 3.0, 9), (2, 5.0, 11), (2, 10.0, 13)],
    ("id", "v", "c"))
df.show(5)
sel_list = ['id', 'c']
df.select(*sel_list).show()
4.1 Custom function via udf, with multiple input columns:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, StringType
def get_hours(cur_day, cur_hour):
    return cur_day * 24 + cur_hour
get_hours_udf = udf(get_hours, IntegerType())
train_data2 = train_data2.withColumn('hours', get_hours_udf(train_data2['day'], train_data2['hour']))
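A minimal sketch applying the udf above to a toy DataFrame (the day/hour values are made up):
# sketch: check get_hours_udf on a small DataFrame
toy = spark_session.createDataFrame([(1, 5), (2, 23)], ('day', 'hour'))
toy.withColumn('hours', get_hours_udf(toy['day'], toy['hour'])).show()
# expected hours: 1*24+5 = 29 and 2*24+23 = 71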
4.2 Custom function based on pandas_udf. This is for PySpark 2 and requires pyarrow==0.14.1 (pin the version; newer releases may not work).
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType
df = spark_session.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("id", "v"))
df.show(5)
@pandas_udf(LongType())  # one input column; this version explicitly builds and returns a pd.Series
def multiply_func1(all_data):
    def helper(input_data):
        return input_data * input_data
    res = [helper(d) for d in all_data]
    return pd.Series(res)
@pandas_udf(LongType())  # two input columns; operates on the Series directly, the recommended style
def multiply_func2(all_data1, all_data2):
    return all_data1 * all_data2 + 5
df = df.withColumn('test1', multiply_func1(col("v")))
df.show()
df = df.withColumn("ss2", multiply_func2(col("v"), col("test1")))
df.show()
The output is:
+---+----+
| id|   v|
+---+----+
|  1| 1.0|
|  1| 2.0|
|  2| 3.0|
|  2| 5.0|
|  2|10.0|
+---+----+
+---+----+-----+
| id|   v|test1|
+---+----+-----+
|  1| 1.0|    1|
|  1| 2.0|    4|
|  2| 3.0|    9|
|  2| 5.0|   25|
|  2|10.0|  100|
+---+----+-----+
+---+----+-----+----+
| id|   v|test1| ss2|
+---+----+-----+----+
|  1| 1.0|    1|   6|
|  1| 2.0|    4|  13|
|  2| 3.0|    9|  32|
|  2| 5.0|   25| 130|
|  2|10.0|  100|1005|
+---+----+-----+----+
Note: if the input or output involves a String type, it seems only the form that returns a Series works:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import StringType
df = spark_session.createDataFrame(
    [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
    ("id", "v"))
df.show(5)
@pandas_udf(StringType())  # one input column; build and return a Series
def multiply_func1(all_data):
    return all_data.apply(lambda x: str(x * 2))
df = df.withColumn('test1', multiply_func1(col("v")))
df.show()
# df = df.withColumn("ss2", multiply_func2(col("v"), col("test1")))
# df.show()
# the two commented lines above raise an error (test1 is now a string column)
A second experiment, with a string input column and an int return type:
import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import IntegerType
df = spark_session.createDataFrame(
    [("1", 1.0), ("1", 2.0), ("2", 3.0), ("2", 5.0), ("2", 10.0)],
    ("height", "v"))
df.show(5)
@pandas_udf(IntegerType())  # one input column, but returns a plain int instead of a Series
def multiply_func1(all_data):
    return int(all_data)
df.withColumn('v', multiply_func1(col("height"))).show()  # does not raise an error
df.withColumn('test1', multiply_func1(col("height"))).show()  # raises an error
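A way to avoid the error, consistent with the note above, is to return a whole pandas Series rather than a single value; a hedged sketch (the new column name is arbitrary):
# sketch: parse the string column by returning a pandas Series, which satisfies the pandas_udf contract
@pandas_udf(IntegerType())
def parse_int_func(all_data):
    return all_data.astype('int32')  # int32 matches IntegerType
df.withColumn('height_int', parse_int_func(col("height"))).show()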
5. groupBy aggregation and concatenation:
from pyspark.sql.functions import collect_list
u1 = train_data2.groupBy(['user', 'hour', 'day']).agg(collect_list(train_data2["pkgName"]).alias('pkgName_list'))
# collect_set is also available (it drops duplicates); see the sketch below
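A self-contained sketch on a toy DataFrame showing collect_list next to collect_set (names are illustrative):
from pyspark.sql.functions import collect_list, collect_set
toy = spark_session.createDataFrame(
    [('u1', 'appA'), ('u1', 'appA'), ('u1', 'appB'), ('u2', 'appC')],
    ('user', 'pkgName'))
toy.groupBy('user').agg(
    collect_list('pkgName').alias('pkg_list'),  # keeps duplicates, e.g. [appA, appA, appB]
    collect_set('pkgName').alias('pkg_set')     # de-duplicated, e.g. [appA, appB]
).show(truncate=False)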
6. Cast a column to a different data type:
data_df = data_df.withColumn("APP_HOBY_CASH_LOAN", data_df["APP_HOBY_CASH_LOAN"].cast(StringType()))
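cast also accepts the type name as a string; a small sketch on a toy DataFrame, verified with printSchema:
# sketch: cast using a type-name string and confirm the schema change
toy = spark_session.createDataFrame([(1, "10"), (2, "20")], ("id", "amount"))
toy = toy.withColumn("amount", toy["amount"].cast("double"))
toy.printSchema()  # amount should now be double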
7. Filter rows of a PySpark DataFrame that satisfy a condition
train_data2 = train_data.filter(train_data.day == "2020-05-04")
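filter (and its alias where) also takes a SQL expression string, and conditions can be combined with & and |; a hedged sketch that assumes train_data also has an hour column:
# sketch: equivalent and combined filters
from pyspark.sql.functions import col
train_data2 = train_data.filter("day = '2020-05-04'")  # SQL expression string
train_data3 = train_data.filter((col("day") == "2020-05-04") & (col("hour") >= 12))  # hour column is assumed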
8. Save a PySpark DataFrame into a specific Hive partition
from pyspark.sql import functions as F
spark_df = spark_df.withColumn("day", F.lit("2021-11-11"))
spark_df.write.saveAsTable("DB.table", format="hive", mode="overwrite", partitionBy="day")
# DB.table is the target database and table name in the warehouse, and day is the partition column; this writes the 2021-11-11 data into that partition
9. Convert a delimiter-separated string column into a vector (e.g. turn 1*3*5*8*6*2 into the vector 1,3,5,8,6,2)
from pyspark.ml.linalg import VectorUDT, Vectors
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, DoubleType,FloatType
# assume every value in data_df's user_vector column is a string of the form 1*3*5*8*6*2
def get_vector(user_vector_string):
    res = []
    u1 = user_vector_string.split("*")
    for u in u1:
        res.append(float(u))
    return res
get_vector_udf = udf(get_vector, ArrayType(FloatType()))
data_df2 = data_df.withColumn("vector", get_vector_udf(data_df['user_vector']))
list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
data_df3 = data_df2.withColumn("features", list_to_vector_udf(data_df2['vector']))
# the features column can then be used for ML operations such as KMeans clustering, sketched below
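Following the comment above, a hedged sketch of KMeans clustering on the features column (k and seed are arbitrary choices):
# sketch: cluster the dense vectors in the features column with KMeans
from pyspark.ml.clustering import KMeans
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=5, seed=42)
model = kmeans.fit(data_df3)
clustered = model.transform(data_df3)  # adds a 'cluster' column with the assigned cluster id
clustered.select("user_vector", "cluster").show(5, truncate=False)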
10. Batch-rename all columns of a PySpark DataFrame
res_col_list = [..., ..., ...]  # the desired column names, one per existing column, in order
data_df = data_df.toDF(*res_col_list)
11. Pass an extra parameter into a udf
from pyspark.sql import functions as F
# this udf replaces the value -7 with fill_data
def nan_data_process_udf(fill_data):
    def nan_data_process(col_data, fill_data):
        if int(col_data) == -7:
            return fill_data
        else:
            return col_data
    return F.udf(lambda x: nan_data_process(x, fill_data))
# then:
data_df = data_df.withColumn(cur_col, nan_data_process_udf(cur_avg)(F.col(cur_col)))
# cur_avg is the extra parameter being passed in (see the sketch below for one way to compute it)
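For context, a sketch of one way cur_avg might be obtained, e.g. as the column mean computed with F.avg (the column name is hypothetical):
# sketch: use the column mean as the fill value, then apply the parameterized udf
cur_col = "some_numeric_col"  # hypothetical column name
cur_avg = data_df.select(F.avg(F.col(cur_col))).collect()[0][0]
data_df = data_df.withColumn(cur_col, nan_data_process_udf(cur_avg)(F.col(cur_col)))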
12. Select the columns contained in a list
columns = ['home','house','office','work']
#select the list of columns
df_tables_full.select('time_event','kind','schema','table',*columns).show()
df_tables_full = df_tables_full.select('time_event','kind','schema','table',*columns)
PySpark 3 only
1. Use apply and add the result as a new column
import pyspark.pandas as ps
def func(s) -> str:
    return str(s) + "100"
df = ps.DataFrame({'A': range(10)})
# Note: the df here is a pyspark.pandas (pandas-on-Spark) DataFrame, while a DataFrame read from csv in the traditional way is a pyspark.sql.DataFrame; for the latter, call .to_pandas_on_spark() first to convert it.
print(df.head(5))
res = df.A.apply(func)
# print(res)
# print(type(res))
res.name = "func"  # rename the result column
df = df.join(res)
print(df.head(5))
# Output:
# A
# 0 0
# 1 1
# 2 2
# 3 3
# 4 4
# A func
# 0 0 0100
# 1 1 1100
# 3 3 3100
# 2 2 2100
# 5 5 5100
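Regarding the conversion note above, a hedged sketch of moving between a regular pyspark.sql.DataFrame and a pandas-on-Spark DataFrame (assumes PySpark 3.2+, where to_pandas_on_spark is available):
# sketch: convert a pyspark.sql.DataFrame to pandas-on-Spark, use apply, then convert back
sdf = spark_session.createDataFrame([(0,), (1,), (2,)], ("A",))  # regular Spark DataFrame
psdf = sdf.to_pandas_on_spark()            # now a pyspark.pandas.DataFrame
psdf["func"] = psdf["A"].apply(lambda s: str(s) + "100")
sdf2 = psdf.to_spark()                     # back to a pyspark.sql.DataFrame
sdf2.show()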