Assumptions:
All purchases follow a defined flow:
The user first clicks a product to see its details
The user adds the product to the cart
The user orders the product
Since the data we are given consists of clicks and the activity that follows them, we assume a user who visits the site will always click.
For example, session_clicks for any given product/aid: rating = p(aid | click),
Similarly, session_carts for any given product/aid: rating = p(aid | cart, click) {because an item observed in the cart must have been clicked first}
And session_orders for any given product/aid: rating = p(aid | order, cart, click)
In this notebook, for any given item: rating (the "interest" column) = (cumulative distribution at that point) * (number of times the item triggered the event) / (total number of that event type in the session)
Although this appears to ignore the probability of a click converting into a "cart"/"order", that could be incorporated later to improve performance.
Timestamps are used only to count clicks/carts/orders; they are not broken down by time of day/month/year.
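A minimal numeric sketch of this rating (the values below are invented for illustration; the notebook computes the cumulative distribution and counts per session with Spark window functions later on):
# Toy example of the rating formula above (hypothetical values, one session, "clicks" only).
cume_dist_at_aid = 0.6        # cumulative distribution at this aid within the session
aid_event_count = 4           # times this aid triggered the event (clicks here)
total_events_in_session = 10  # total click events recorded in the session
interest = cume_dist_at_aid * (aid_event_count / total_events_in_session)
print(interest)  # 0.24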
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
sample_submission = "/kaggle/input/otto-recommender-system/sample_submission.csv"
test = "/kaggle/input/otto-recommender-system/test.jsonl"
train = "/kaggle/input/otto-recommender-system/train.jsonl"
# df_train = pd.read_parquet(otto-chunk-data-inparquet-format)
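# Hedged sanity check (optional): each JSONL line holds one session with a list of
# {"aid", "ts", "type"} events. Peeking at a few raw rows with pandas confirms the
# layout before the Spark transforms below (nrows with lines=True needs pandas >= 1.1).
sample_sessions = pd.read_json(train, lines=True, nrows=5)
print(sample_sessions.head())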
%%time
## Installing Apache Spark
!pip install pyspark --quiet
# %%time
# ## Installing polars
# !pip install polars
#Generic Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
#Apache Spark Libraries
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
#Apache Spark SQL Functions
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import explode, first, col, monotonically_increasing_id, lit, when, cume_dist, count, sum
#Apache Spark ML Classifier Libraries
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,NaiveBayes
#Apache Spark Evaluation Library
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#Apache Spark Features libraries
from pyspark.ml.feature import StandardScaler,StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder, Normalizer, MinMaxScaler
#Apache Spark Pipeline Library
from pyspark.ml import Pipeline
# Apache Spark `DenseVector`
from pyspark.ml.linalg import DenseVector
#Data Split Libraries
import sklearn
from sklearn.model_selection import train_test_split
#Model tuning and evaluation libraries
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
#Tabulating Data
from tabulate import tabulate
#Garbage
import gc
gc.collect()
import multiprocessing
cores = multiprocessing.cpu_count()
print(cores)
spark = SparkSession.builder.\
appName("FirstSparkApplication").\
config ("spark.executor.memory", "25g").\
config ("spark.default.parallelism","400").\
config ("spark.sql.shuffle.partitions","10000").\
config ("spark.sql.inMemoryColumnarStorage.compressed", True).\
config ("spark.sql.inMemoryColumnarStorage.batchSize", 10000).\
getOrCreate()
spark.sparkContext.setLogLevel('WARN')
spark.sparkContext.version
spark.sparkContext.setCheckpointDir("/kaggle/temp/")
gc.collect()
%%time
"""Defining Model
"""
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import Row
als = ALS(rank=20, # default rank is 10
maxIter=2 , regParam=0.01,
userCol="session_type", itemCol="aid", ratingCol="interest", #ratings/possible_interest
coldStartStrategy="drop",
implicitPrefs=True)
gc.collect()
%%time
"""importing and transforming Training Data
"""
prod_df_1 = spark.read.json(train)
df_1 = prod_df_1.na.drop() #Wall time: 2min 15s
df_sel = df_1.select("session", explode("events").alias("events")).select("events.*","*").sort("ts").select("aid","ts","session","type")
df_indexed = df_sel.withColumn("typeindex", when(df_sel.type == "clicks",0)
.when(df_sel.type == "carts",1)
.otherwise(2))
df_session_type = df_indexed.withColumn("session_type",(df_indexed.session)*10 + df_indexed.typeindex).select("aid","ts","session","session_type","type")
## Aggregating using "Window Partition"
windowPartition = Window.partitionBy("session_type").orderBy("aid")
df_freq=df_session_type.withColumn("cume_dist",
cume_dist().over(windowPartition)).withColumn("frequency",count(col("ts")).over(windowPartition))
df_final = df_freq.withColumn("sumOfEvents",sum(col("frequency")).over(windowPartition))
df_train = df_final.select("aid","session_type",(df_final.cume_dist*(df_final.frequency / df_final.sumOfEvents)).alias('interest'))
df_train_check = df_train.checkpoint()
df_train_check.write.parquet("train.parquet")
# parDF1=spark.read.parquet("/temp/out/people.parquet")
#Wall time: 9min 41s with parallelism = 400 / partitions = 4 / executor_memory = 2G
#Wall time: 13min 33s with parallelism = 400 / partitions = 10000 / executor_memory = 25G
gc.collect()
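# Hedged toy example (hypothetical aids, a single session_type) of what the ordered
# window above actually computes: with orderBy("aid"), cume_dist(), count() and sum()
# are all cumulative up to the current aid, which is the "cumulative distribution at
# that point" described in the assumptions.
toy = spark.createDataFrame(
    [(11, 1000, 10), (11, 1001, 10), (42, 1002, 10)],
    ["aid", "ts", "session_type"])
w = Window.partitionBy("session_type").orderBy("aid")
toy.withColumn("cume_dist", cume_dist().over(w)) \
   .withColumn("frequency", count(col("ts")).over(w)) \
   .withColumn("sumOfEvents", sum(col("frequency")).over(w)) \
   .withColumn("interest", col("cume_dist") * col("frequency") / col("sumOfEvents")) \
   .show()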
# %%time
# model = als.fit(df_train_check)
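# Hedged sketch of how the fitted model would be queried, kept commented out like the
# fit above because both steps are expensive on the full OTTO data; recommendForAllUsers
# is the standard pyspark.ml.recommendation.ALSModel API for top-N items per user.
# model = als.fit(df_train_check)
# top20 = model.recommendForAllUsers(20)   # -> session_type, [(aid, rating), ...] per row
# top20.show(5, truncate=False)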
%%time
"""Importing and transforming data from Test File
"""
prod_df_test = spark.read.json(test)
df_test = prod_df_test.na.drop() #Wall time: 2min 15s
df_sel_test = df_test.select("session", explode("events").alias("events")).select("events.*","*").sort("ts").select("aid","ts","session","type")
df_indexed_test = df_sel_test.withColumn("typeindex", when(df_sel_test.type == "clicks",0)
.when(df_sel_test.type == "carts",1)
.otherwise(2))
df_session_type_test = df_indexed_test.withColumn("session_type",(df_indexed_test.session)*10 + df_indexed_test.typeindex).select("aid","ts","session","session_type","type")
## Aggregating using "Window Partition"
windowPartition = Window.partitionBy("session_type").orderBy("aid")
df_freq_test=df_session_type_test.withColumn("cume_dist",
cume_dist().over(windowPartition)).withColumn("frequency",
count(col("ts")).over(windowPartition))
df_final_test = df_freq_test.withColumn("sumOfEvents",sum(col("frequency")).over(windowPartition))
df_test = df_final_test.select("aid","session_type",(df_final_test.cume_dist*(df_final_test.frequency / df_final_test.sumOfEvents)).alias('interest'))
df_test_check=df_test.checkpoint() #Wall time: 29.1 s #Wall time: 24.8 s
df_test_check.write.parquet("test.parquet")
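# Hedged sketch (not executed here): the ParamGridBuilder / CrossValidator /
# RegressionEvaluator imported earlier could tune the ALS hyper-parameters, scoring
# RMSE on the "interest" column. RMSE is only a rough proxy for the competition's
# ranking-based metric, so treat this as a starting point rather than a final setup.
param_grid = (ParamGridBuilder()
              .addGrid(als.rank, [10, 20])
              .addGrid(als.regParam, [0.01, 0.1])
              .build())
evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="interest",
                                predictionCol="prediction")
cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)
# cv_model = cv.fit(df_train_check)   # very slow on the full training set
# best_rank = cv_model.bestModel.rank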