假设我在Amazon S3上有销售表的Parquet数据文件的路径,包含ID主键、门店ID、日期、销售员姓名和销售额,需要分别用PySpark的SparkSQL和Dataframe API统计出每个月所有门店和各门店销售额最高的人,不一定是一个人,以及他所在的门店ID和月总销售额。
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Initialize the Spark session.
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Read the Parquet sales data from S3.
df = spark.read.parquet("s3://path/to/sales/data")

# Derive a yyyy-MM month key, then total sales per (store, month, salesperson).
# FIX: import `functions as F` instead of `from pyspark.sql.functions import
# sum, max` — the original shadowed the built-in sum()/max() for the whole
# module.
sales_aggregated = (
    df.withColumn("month", F.date_format(F.col("日期"), "yyyy-MM"))
    .groupBy("门店ID", "month", "销售员姓名")
    .agg(F.sum("销售额").alias("sales_total"))
)

# Window spec: one partition per (store, month).
window_spec = Window.partitionBy("门店ID", "month")

# Keep the top seller(s) per store-month (ties preserved, per the requirement
# that more than one person may share the maximum), alongside the store's
# monthly total computed over the same window.
result_df = (
    sales_aggregated
    .withColumn("max_sales", F.max("sales_total").over(window_spec))
    .withColumn("monthly_total", F.sum("sales_total").over(window_spec))
    .filter(F.col("sales_total") == F.col("max_sales"))
    .select("month", "门店ID", "monthly_total", "销售员姓名", "sales_total")
    .orderBy("month", "门店ID", "销售员姓名")
)

# Show the result.
result_df.show()
# 注册DataFrame为临时视图
df.createOrReplaceTempView("sales_data")
# 执行SQL查询
sql_result = spark.sql("""
WITH sales_aggregated AS (
SELECT
门店ID,
date_format(日期, 'yyyy-MM') AS month,
销售员姓名,
SUM(销售额) AS sales_total
FROM sales_data
GROUP BY 门店ID, date_format(日期, 'yyyy-MM'), 销售员姓名
)
SELECT
month,
门店ID,
monthly_total,
销售员姓名,
sales_total
FROM (
SELECT
month,
门店ID,
销售员姓名,
sales_total,
MAX(sales_total) OVER (PARTITION BY 门店ID, month) AS max_sales,
SUM(sales_total) OVER (PARTITION BY 门店ID, month) AS monthly_total
FROM sales_aggregated
)
WHERE sales_total = max_sales
ORDER BY month, 门店ID, 销售员姓名
""")
# 显示结果
sql_result.show()
`date_format` 将日期字段处理为年月格式。两种实现方式均会输出以下列:

- `month`:年月格式(yyyy-MM)
- `门店ID`:门店标识
- `monthly_total`:该门店当月的总销售额
- `销售员姓名`:当月销售额最高的销售员(并列最高时会输出多行)
- `sales_total`:该销售员当月的销售额(等于当月最高销售额)

假设我在Amazon S3上有销售表的Parquet数据文件的路径,包含ID主键、门店ID、日期、销售员姓名和销售额,需要分别用PySpark的SparkSQL和Dataframe API统计出按月统计的同比和环比数据;当前月如果不是月底的话,同比或环比数据需要取上个月或者去年同月1日到对应日期的总销售额值。
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark session for the growth-rate analysis job.
spark = SparkSession.builder.appName("SalesAnalysis").getOrCreate()

# Load the Parquet sales data from S3.
df = spark.read.parquet("s3://your-bucket/path/to/sales_data")

# Anchor "today" once on the Spark side so every downstream comparison uses
# the same reference date.
current_date = spark.sql("SELECT current_date()").first()[0]
current_year, current_month, current_day = (
    current_date.year,
    current_date.month,
    current_date.day,
)
# Pre-processing: truncate every month to the 1st..current-day window so that
# historical months are compared over the same day range as the (possibly
# incomplete) current month, as the requirement demands.
import calendar

# FIX: the requirement says truncation applies only "当前月如果不是月底的话"
# (when today is NOT month-end). The original truncated unconditionally, so
# e.g. on Feb 28 (a complete month) every 29/30/31-day month was still cut
# down to 28 days. Only filter when the current month is incomplete.
_days_in_current_month = calendar.monthrange(current_year, current_month)[1]
processed_df = df.withColumn("date", F.col("date").cast("date"))
if current_day < _days_in_current_month:
    processed_df = (
        processed_df
        # cutoff_day = min(today's day-of-month, the month's own length),
        # so short months (e.g. February) are never over-truncated.
        .withColumn("last_day", F.last_day("date"))
        .withColumn("max_day", F.dayofmonth("last_day"))
        .withColumn("cutoff_day", F.least(F.lit(current_day), F.col("max_day")))
        .filter(F.dayofmonth("date") <= F.col("cutoff_day"))
    )
# Monthly totals: one row per (year, month) with the summed sales amount.
monthly_sales = (
    processed_df
    .select(
        F.year("date").alias("year"),
        F.month("date").alias("month"),
        "sales",
    )
    .groupBy("year", "month")
    .agg(F.sum("sales").alias("total_sales"))
)
# Attach, to every row, the (year, month) keys of the previous month and of
# the same month one year earlier; January rolls back to December of the
# prior year.
is_january = F.col("month") == 1
monthly_sales = monthly_sales.select(
    "*",
    F.when(is_january, F.col("year") - 1)
     .otherwise(F.col("year"))
     .alias("prev_month_year"),
    F.when(is_january, F.lit(12))
     .otherwise(F.col("month") - 1)
     .alias("prev_month_month"),
    (F.col("year") - 1).alias("prev_year_year"),
    F.col("month").alias("prev_year_month"),
)
# Expose the aggregate as a temp view (available for ad-hoc SQL queries).
monthly_sales.createOrReplaceTempView("monthly_sales")
# Self-join the monthly totals to pull each row's previous-month and
# previous-year sales alongside it; left joins keep months that have no
# comparison period (their comparison columns come back NULL).
curr = monthly_sales.alias("curr")
prev_m = monthly_sales.alias("prev_month")
prev_y = monthly_sales.alias("prev_year")

prev_month_cond = (
    (F.col("curr.prev_month_year") == F.col("prev_month.year"))
    & (F.col("curr.prev_month_month") == F.col("prev_month.month"))
)
prev_year_cond = (
    (F.col("curr.prev_year_year") == F.col("prev_year.year"))
    & (F.col("curr.prev_year_month") == F.col("prev_year.month"))
)

final_result = (
    curr
    .join(prev_m, prev_month_cond, "left")
    .join(prev_y, prev_year_cond, "left")
    .select(
        F.col("curr.year"),
        F.col("curr.month"),
        F.col("curr.total_sales"),
        F.col("prev_month.total_sales").alias("prev_month_sales"),
        F.col("prev_year.total_sales").alias("prev_year_sales"),
    )
)
# Growth rates: month-over-month and year-over-year percentage change.
# FIX: the original had unbalanced parentheses and chained the second
# `.withColumn` onto the Column expression inside the first call's argument
# list, which is a syntax error. Both columns are now proper chained
# `withColumn` calls on the DataFrame; rows without a comparison period
# yield NULL (division involving NULL propagates NULL in Spark).
final_result = (
    final_result
    .withColumn(
        "month_over_month",
        (F.col("total_sales") - F.col("prev_month_sales"))
        / F.col("prev_month_sales") * 100,
    )
    .withColumn(
        "year_over_year",
        (F.col("total_sales") - F.col("prev_year_sales"))
        / F.col("prev_year_sales") * 100,
    )
)
# Show the result.
final_result.show()
# 注册预处理后的视图
processed_df.createOrReplaceTempView("processed_sales")
# 执行SQL查询
sql_query = """
WITH monthly_sales AS (
SELECT
YEAR(date) AS year,
MONTH(date) AS month,
SUM(sales) AS total_sales
FROM processed_sales
GROUP BY YEAR(date), MONTH(date)
),
comparison_data AS (
SELECT
curr.year,
curr.month,
curr.total_sales,
prev_month.total_sales AS prev_month_sales,
prev_year.total_sales AS prev_year_sales
FROM monthly_sales curr
LEFT JOIN monthly_sales prev_month
ON (curr.year = prev_month.year AND curr.month = prev_month.month + 1)
OR (curr.month = 1 AND prev_month.month = 12 AND curr.year = prev_month.year + 1)
LEFT JOIN monthly_sales prev_year
ON curr.year = prev_year.year + 1 AND curr.month = prev_year.month
)
SELECT
year,
month,
total_sales,
ROUND((total_sales - prev_month_sales) / prev_month_sales * 100, 2) AS mom_growth,
ROUND((total_sales - prev_year_sales) / prev_year_sales * 100, 2) AS yoy_growth
FROM comparison_data
ORDER BY year, month
"""
spark.sql(sql_query).show()
数据预处理:将日期字段转换为 date 类型,并在当前月不完整时把每个月都截断到 1 日至当前对应日期,保证各月统计区间可比。
聚合计算:按年、月分组汇总销售额,得到每月总销售额 total_sales。
增长率计算:通过自连接分别取得上月和去年同月的销售额,计算环比(month_over_month / mom_growth)和同比(year_over_year / yoy_growth)百分比。
特殊处理:1 月的环比对应上一年 12 月;没有可比较期间的月份,其环比或同比结果为 NULL。