# Pitfalls
# In PySpark, unless you run an action (show(), count rows, etc.), every operation only defines a table; nothing is actually computed.
# So when joining, to keep results correct, build the habit of: 1. join the small table to the large table; 2. when joining the large table to the small one, rename the key column (e.g. 'A_KEY') on one side first.
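# A minimal sketch of the rename-before-join habit (df_big, df_small, and A_KEY are hypothetical names, not from these notes):
df_small_r = df_small.withColumnRenamed('A_KEY', 'A_KEY_S')
joined = df_big.join(df_small_r, df_big.A_KEY == df_small_r.A_KEY_S, how='left')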
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import DateType
from pyspark.sql.window import *
from pyspark.sql.functions import lower, col # lowercase
from pyspark.sql.functions import upper, col # uppercase
from pyspark.sql.functions import lit # add a literal/constant column
from pyspark.sql.functions import when # if/else
from pyspark.sql.functions import split, explode, concat, concat_ws # split (split a column), explode (one row into many rows), concat/concat_ws (concatenate columns)
from pyspark.sql.types import StringType # data types
from pyspark.sql.functions import UserDefinedFunction # user-defined functions
from pyspark.sql.functions import desc # descending sort
from pyspark.sql.functions import trim # strip whitespace
a.createOrReplaceTempView("a")
# Create a PySpark DataFrame from SQL
a = spark.sql("select * from a").cache()
# Preview the first rows
a.show(10)
a.take(10)
# Drop duplicates
df = df.dropDuplicates()
df.select('A_field').distinct().count()
# Row count
a.count()
len(ORD_pro.collect()) # collect() pulls every row to the driver, so count() is usually the faster and safer choice on large tables
# Column names
a.columns
# Column types
a.dtypes
# Schema
a.printSchema()
# Cast decimals to strings
PriceBook_STBT = ST.select( ST["BT"].cast("int").cast("string"),ST["ST"].cast("int").cast("string"))
priceBook_BT = kadl.select(kadl["BT"].cast("int").cast("string"),kadl["AICPGP"].cast("string"))
# Rename columns
ST_SKU_1= ST_SKU_1.withColumnRenamed("CUST_ID",'ShipToNumber').withColumnRenamed("SKU",'SKUNumber')
ORD_table.selectExpr("SVC_NOTIF_KEY as SVC_NOTIF_KEY_1", "NOTIF_NBR as NOTIF_NBR_1").show()
# Summary statistics for a column
a.select('CONTACT_KEY').describe().show()
# Drop a column
b1 = b.drop("CONTACT_KEY")
b1.show() # show() returns None, so don't assign its result
# Filter rows matching a condition
a.filter(a.CONTACT_KEY == 504943)
# Filter on multiple conditions (each condition must be parenthesized)
EQUIP_pro_1.filter( (EQUIP_pro_1.INSTALL_DT >= '20200101') & (EQUIP_pro_1.INSTALL_DT <= '20200331') ) # wrap each condition in parentheses when combining with &
## Filter with like
Total_Booking5 = Total_Booking4.filter(~ col('Login').like('%thermofisher.com%') )
# Filter records starting with a given prefix
df_1 = df.filter(lower(df.current_pagename_new).like('products:%'))
# Filter null values
EQUIP_pro.filter("INSTALL_DT is NULL").select('INSTALL_DT').count()
Q3_Campaign_1.filter(Q3_Campaign_1.login.isNotNull())
# Add a column flagging whether a field is null (use isNull() or isNotNull())
df.withColumn("is_person_name_null", col("USER_NM").isNull())
# Add a constant column
frame.withColumn("contant", F.lit(10))
# Add a computed column
frame.withColumn("name_length", F.length(frame.name))
# Convert a timestamp to a date
# Method 1:
df = df.withColumn('UPDT_DT', F.to_date(df.UPDT_DT))
df = df.withColumn('CRT_DT', F.to_date(df.CRT_DT))
# Method 2:
df.select('UPDT_DT').withColumn("UPDT_DT_1", col("UPDT_DT").cast("date"))
# Cast a string column to a date
df.withColumn("BPEFTJ_1",df['BPEFTJ'].cast(DateType()))
# Lowercase a column
from pyspark.sql.functions import lower, col
df = spark.table('df').withColumn('CONTACT_ID_1', lower(col('CONTACT_ID')))
# Uppercase and trim whitespace
df.withColumn("USER_NM", upper(trim(col("USER_NM")))).show() # operates on a DataFrame
dataframeColnames.createOrReplaceTempView("dataframeColnames")
import pyspark.sql.functions as F
from pyspark.sql.functions import col
def single_space(col):
    # collapse runs of spaces into one and trim leading/trailing spaces
    return F.trim(F.regexp_replace(col, " +", " "))
def remove_all_whitespace(col):
    # remove all whitespace, including interior spaces
    return F.regexp_replace(col, "\\s+", "")
spark.table('WEB_USER').withColumn('USER_NM_1', lower(remove_all_whitespace(single_space(col("USER_NM"))))).show() # operates on a table
# Sorting
WEB_USER_3.sort('CONTACT_ID_1', 'USER_NM_1', ascending=False).show() # descending; default is ascending (all columns in the same direction)
WEB_USER_3.sort(WEB_USER_3.CONTACT_ID_1.desc(), WEB_USER_3.USER_NM_1.asc()).show() # mix ascending and descending per column
# Group by
WEB_USER_3.groupBy('CONTACT_ID_1').agg(F.count('CONTACT_ID_1').alias('count')).sort(desc('count')).show() # or F.countDistinct(...)
df.groupBy("login").count().sort(desc("count")).show()
df.groupBy('level').agg(F.concat_ws(',', F.collect_list(df.name))).show()
# Pick the latest date per group
df.groupBy("SKU").agg(F.max("BPEFTJ").alias("BPEFTJ")).show() # returns only the SKU and BPEFTJ columns
df.join(df.groupBy("SKU").agg(F.max("BPEFTJ").alias("BPEFTJ")), ["SKU", "BPEFTJ"], how="inner") # keeps all columns
# Current system date
WOP_Price.withColumn("OP_Data",current_date())
# Format a date as a string (TO_CHAR is warehouse SQL such as Oracle, not a PySpark function)
TO_CHAR(COMPLT_DATETIME, 'YYYYMMDD' ) COMPLT_DATETIME
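# A hedged PySpark equivalent of the line above (COMPLT_DATETIME is assumed to be a timestamp column on df):
df.withColumn("COMPLT_DATETIME_STR", F.date_format(col("COMPLT_DATETIME"), "yyyyMMdd"))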
# if/else
# Method 1:
from pyspark.sql.functions import when
df = df.withColumn("profile", when(df.age >= 40, "Senior").otherwise("Executive"))
order3 = order2.withColumn("Order Cancelled(Y/N/P)", when((order2.status == -1) & (order2.received_quantity > 0), "P").when(order2.status == -1, "Y").otherwise("N"))
# Method 2: define a Python function and wrap it as a UDF
def somefunc(value):
    if (value == 'a') | (value == 'b'):
        return 'Yes'
    else:
        return 'No'
# F.udf(function, return type)
udfsomefunc = F.udf(somefunc, StringType())
a2 = a1.withColumn("abc", udfsomefunc("SCb_Name")).select('SCb_Name', 'SGN', 'abc')
# Add a column with the character length of a field
frame3_1 = WEB_USER_3.withColumn("name_length", F.length(WEB_USER_3.USER_NM_1))
ST_SKU_1.withColumn('Input', F.lit('Viewed')).show()
from pyspark.sql.functions import lit
new_df = df1.withColumn('newCol', lit(0)) # new column of zeros
new_df = df.withColumn('new_column_1', lit(None).cast(StringType())) # new column of NULLs
# Joins
df = df1.join(df2, df1.name == df2.name, how='inner') # how can be 'inner', 'outer', 'left', 'right', 'left_anti', ...
df = df1.join(df2, ['BT'], how='inner')
df.show()
# Full outer join (union of keys)
jd = df.join(defaults, on="foo", how='outer')
# Split a column
from pyspark.sql.functions import split, explode, concat, concat_ws
df_split = df.withColumn("s", split(df['score'], " ")) # split the score column into a new column s
df_split.show()
# Concatenate columns
ST_SKU_2.withColumn('STSKU', concat(ST_SKU_2['ShipToNumber'], ST_SKU_2['SKUNumber'])) # no separator
ST_SKU_2.withColumn('STSKU', concat_ws("-", ST_SKU_2['ShipToNumber'], ST_SKU_2['SKUNumber'])) # with a separator
# Reshape wide to long (like R's melt)
from pyspark.sql.functions import split, explode, concat, concat_ws
df1 = df.withColumn("SKU", explode(split(df['prod_list'], ",")))
from pyspark.sql.functions import pandas_udf, PandasUDFType
@pandas_udf("user string, PL string, Order_Number integer", PandasUDFType.GROUPED_MAP)
def data_partiotion(pdf):
    # pdf is the pandas DataFrame for one group; return a pandas DataFrame matching the declared schema
    return pdf[["user", "PL", "Order_Number"]]
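# A hedged usage sketch (the 'user' grouping column is an assumption): apply the grouped-map UDF to each group
result = df.groupBy("user").apply(data_partiotion)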
df.withColumn("datetime", col("datetime").cast("timestamp"))
.groupBy("userId", "memberId")
.agg(max_("datetime"))
# Gotchas
# 1. filter and column naming
test = WEB_USER.groupBy('USER_NM').agg(F.count('USER_NM').alias('count')).sort(desc('count'))
test.filter(test.count > 1).show() # raises: '>' not supported between instances of 'method' and 'int'
# Fix: test.filter(test['count'] > 1).show()
# Reason: 'count' is also a DataFrame method, so test.count resolves to the method, not the column
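# An alternative sketch (an assumption, not from the original notes): alias the aggregate to a name that doesn't clash with DataFrame methods
test = WEB_USER.groupBy('USER_NM').agg(F.count('USER_NM').alias('cnt')).sort(desc('cnt'))
test.filter(test.cnt > 1).show()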
### Remove interior whitespace
def remove_all_whitespace(col):
    return F.regexp_replace(col, "\\s+", "")
df = df.withColumn('Materia_Number', remove_all_whitespace(trim(upper(col('Materia_Number')))))
## Convert a pandas DataFrame to a PySpark DataFrame
sentenceData = spark.createDataFrame(df)
# union (merge DataFrames)
Delear = df.union(df1).union(df2).union(df3).union(df4).distinct().withColumn("op_date",current_date())
# Save as a table
df.write.mode("append").saveAsTable("db.df_a")
df.write.mode("overwrite").saveAsTable("testdb.testtable") # use the string "overwrite" in PySpark; SaveMode.Overwrite is the Scala API
# Keep rows whose value is not in a list
a1 = a.filter(~col('USER_NM').isin(['[email protected]']))
# When the exclusion values live in another DataFrame, convert that column to a Python list first
# Method 1:
mvv_list = bb1.select('USER_NM').collect()
mvv_array = [i.USER_NM for i in mvv_list]
df = a.withColumn('is_user_nm_null',col('USER_NM').isin(mvv_array))
display(df.groupBy('is_user_nm_null').count().sort(desc('count')))
# a1 = a.filter(~col('USER_NM').isin(mvv_array))
# Method 2:
df = df.withColumn("AddCol", when(col("Pclass").contains("3"), "three").otherwise("notthree"))
df.withColumn('cat',
    F.when(df.device.isin(phone_list), 'phones').otherwise(
        F.when(df.device.isin(pc_list), 'pc').otherwise(
            F.when(df.device.isin(security_list), 'security')))
    ).groupBy('id').pivot('cat').agg(F.count('cat')).show()
# Method 3:
a1 = a.join(bb1, ['USER_NM'], how='outer') # full outer join (union of keys)
jd = a1.withColumn("ShipToNumber", coalesce("ShipToNumber", "shipToName_1")).withColumn("ShipToName", coalesce("ShipToName", "shipToName_1"))
# a1 stays the driving table: coalesce keeps ShipToNumber whenever it is non-null (whether or not shipToName_1 exists), and falls back to shipToName_1 only when ShipToNumber is null and shipToName_1 is not
### Pick the single most recent record per group
Customer_info_5_2.withColumn("row_num", row_number().over(Window.partitionBy("ShipToNumber").orderBy(desc("Login_Last_Order_Date")))).filter(col('row_num') == 1)
###
a2.withColumn("row_number",F.row_number().over(Window.partitionBy("CONTACT_KEY").orderBy(desc("CITY")))).filter(col('row_number') == 1)
### Format column names as a quoted, comma-separated list
"','".join(df_1.columns)
### Table column names
D_Equip_2.schema.names
### Bulk-cast column types (PySpark)
for item in aaa.columns:
    aaa = aaa.withColumn(item, col(item).cast("string"))
aaa
### Bulk-cast column types (pandas)
df = df.astype(str)
### Strip spaces from column names (PySpark)
for item in df.columns:
    df = df.withColumnRenamed(item, item.replace(" ", ''))
print(df.columns)
### Build a DataFrame by hand
%scala
val df = Seq(
(1, "First Value", java.sql.Date.valueOf("2010-01-01"))
).toDF("int_column", "string_column", "date_column")
df.createOrReplaceTempView("df")
### Rename columns (pandas)
wop_latest1 = wop_latest[['SKU Number','Sales Order Number']].rename(columns={"SKU Number": "SKUNumber", "Sales Order Number": "SalesOrderNumber"})
#### When the same CONTACT_KEY comes from multiple sources (e.g. pre-sales/SVC), concatenate the Source values
result_4.drop('Source').join(result_4.groupBy("CONTACT_KEY").agg(concat_ws("/",collect_set("Source")).alias('Source')),['CONTACT_KEY'],how='left')