Kaggle learning notes - OTTO baseline 9 - using the observed probability of an event as the rating

Assumptions:
Every purchase follows a defined funnel:
The user first clicks a product to view its details
The user adds the product to the cart
The user orders the product
Since the data we receive consists of clicks and the activity that follows them, we assume that a user who visits the site will always click.
For example, for session_clicks of any particular product (aid): rating = p(aid | clicks).

Similarly, for session_carts of any particular aid: rating = p(aid | carts, clicks) (because we only observe instances where the product has already been added to the cart).

For session_orders of any particular aid: rating = p(aid | orders, carts, clicks).

In this notebook, for any given item: rating = (cumulative distribution at that point) × (number of times the item triggered the event) / (total number of that event type in the session).

Although this seems to ignore the probability of a click converting into a cart/order, that factor may be added later to improve performance.

Timestamps are used only to count clicks/carts/orders; they are not broken down by time of day, month, or year.
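The notebook computes this rating later with PySpark window functions; below is a tiny self-contained sketch of the same idea on made-up rows (column names follow the notebook, the data is invented purely for illustration):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import cume_dist, count, sum as sum_, col

spark = SparkSession.builder.appName("interest-demo").getOrCreate()

# Toy events: (aid, session, type) - invented rows, not from the competition data
toy = spark.createDataFrame(
    [(101, 1, "clicks"), (101, 1, "clicks"), (205, 1, "clicks"), (101, 1, "carts")],
    ["aid", "session", "type"])

# One partition per (session, event type); within it, rank the aids and count events
w = Window.partitionBy("session", "type").orderBy("aid")
rated = (toy
         .withColumn("cume_dist", cume_dist().over(w))
         .withColumn("frequency", count("aid").over(w))
         .withColumn("sumOfEvents", sum_(col("frequency")).over(w))
         .withColumn("interest", col("cume_dist") * col("frequency") / col("sumOfEvents")))
rated.show()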



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
sample_submission = "/kaggle/input/otto-recommender-system/sample_submission.csv"
test = "/kaggle/input/otto-recommender-system/test.jsonl"
train = "/kaggle/input/otto-recommender-system/train.jsonl"
# df_train = pd.read_parquet(otto-chunk-data-inparquet-format)
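Before loading the full files with Spark, one quick optional way to confirm the JSONL layout is to read a single line; each record is expected to hold a session id plus a list of events with aid, ts and type fields, which is what the Spark code below relies on:

import json

# Peek at the first training record only, to confirm the schema used later
with open(train, "r") as f:
    first_record = json.loads(f.readline())

print(first_record["session"])     # session id
print(first_record["events"][:3])  # e.g. [{"aid": ..., "ts": ..., "type": "clicks"}, ...]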
%%time
## Installing Apache Spark
!pip install pyspark --quiet
# %%time
# ## Installing polars
# !pip install polars

#Generic Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Apache Spark Libraries
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

#Apache Spark SQL Functions
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import explode, first, col, monotonically_increasing_id, lit, when, cume_dist, count, sum

#Apache Spark ML Classifier Libraries
from pyspark.ml.classification import DecisionTreeClassifier,RandomForestClassifier,NaiveBayes

#Apache Spark Evaluation Library
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#Apache Spark Features libraries
from pyspark.ml.feature import StandardScaler,StringIndexer, VectorAssembler, VectorIndexer, OneHotEncoder, Normalizer, MinMaxScaler

#Apache Spark Pipeline Library
from pyspark.ml import Pipeline

# Apache Spark `DenseVector`
from pyspark.ml.linalg import DenseVector

#Data Split Libraries
import sklearn
from sklearn.model_selection import train_test_split

#Polars to read files quickly


# Import the requisite packages
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator



#Tabulating Data
from tabulate import tabulate

#Garbage collection
import gc
gc.collect()

#Available CPU cores
import multiprocessing
cores = multiprocessing.cpu_count()
print(cores)
spark = SparkSession.builder.\
                appName("FirstSparkApplication").\
                config("spark.executor.memory", "25g").\
                config("spark.default.parallelism", "400").\
                config("spark.sql.shuffle.partitions", "10000").\
                config("spark.sql.inMemoryColumnarStorage.compressed", True).\
                config("spark.sql.inMemoryColumnarStorage.batchSize", 10000).\
                getOrCreate()
spark.sparkContext.setLogLevel('WARN')
spark.sparkContext.version
spark.sparkContext.setCheckpointDir("/kaggle/temp/")
gc.collect()
%%time
"""Defining Model
"""

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import Row

als = ALS(rank=20,  # default is 10
          maxIter=2, regParam=0.01,
          userCol="session_type", itemCol="aid", ratingCol="interest",  # ratings / possible interest
          coldStartStrategy="drop",
          implicitPrefs=True)  # treat "interest" as implicit-feedback confidence rather than an explicit rating
gc.collect()
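ParamGridBuilder and CrossValidator are imported above but never used; as a rough sketch (hypothetical grid values, not tuned here), they could be wired up to search over the ALS rank and regParam once the training DataFrame exists:

# Hypothetical tuning setup reusing the imports above; the fit line stays commented
# because df_train_check is only built further down.
param_grid = (ParamGridBuilder()
              .addGrid(als.rank, [10, 20])
              .addGrid(als.regParam, [0.01, 0.1])
              .build())

evaluator = RegressionEvaluator(metricName="rmse", labelCol="interest", predictionCol="prediction")

cv = CrossValidator(estimator=als,
                    estimatorParamMaps=param_grid,
                    evaluator=evaluator,
                    numFolds=3)
# cv_model = cv.fit(df_train_check)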
%%time
"""importing and transforming Training Data
"""
prod_df_1 = spark.read.json(train)
df_1 = prod_df_1.na.drop() #Wall time: 2min 15s
df_sel = df_1.select("session", explode("events").alias("events")).select("events.*","*").sort("ts").select("aid","ts","session","type")

# Encode the event type as an integer: clicks -> 0, carts -> 1, orders -> 2
df_indexed = df_sel.withColumn("typeindex", when(df_sel.type == "clicks",0)
                                 .when(df_sel.type == "carts",1)
                                 .otherwise(2))

# Combine session id and event type into a single ALS "user" id: session*10 + typeindex
df_session_type = df_indexed.withColumn("session_type",(df_indexed.session)*10 + df_indexed.typeindex).select("aid","ts","session","session_type","type")

## Aggregating using "Window Partition"
windowPartition = Window.partitionBy("session_type").orderBy("aid")

# cume_dist: cumulative distribution of aids within each session_type partition;
# frequency / sumOfEvents: running counts over the same aid-ordered window,
# combined below into the "interest" rating described in the intro
df_freq=df_session_type.withColumn("cume_dist",
              cume_dist().over(windowPartition)).withColumn("frequency",count(col("ts")).over(windowPartition))

df_final = df_freq.withColumn("sumOfEvents",sum(col("frequency")).over(windowPartition))

df_train = df_final.select("aid","session_type",(df_final.cume_dist*(df_final.frequency / df_final.sumOfEvents)).alias('interest'))

df_train_check = df_train.checkpoint()
df_train_check.write.parquet("train.parquet") 
# parDF1=spark.read.parquet("/temp/out/people.parquet")

#Wall time: 9min 41s with parallelism = 400 / partitions = 4 / executor_memory = 2G
#Wall time: 13min 33s with parallelism = 400 / partitions = 10000 / executor_memory = 25G
gc.collect()
# %%time
# model = als.fit(df_train_check)
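The fit above is left commented out; if it were run, a minimal sketch of querying the trained model could look like the following (recommendForUserSubset is the standard pyspark.ml ALS API; the "users" here are the encoded session_type ids built earlier):

# Hypothetical continuation of the commented-out cell above
model = als.fit(df_train_check)

# Top-20 aids for a handful of encoded session_type "users" (session*10 + typeindex)
some_users = df_train_check.select("session_type").distinct().limit(5)
top20 = model.recommendForUserSubset(some_users, 20)
top20.show(truncate=False)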
%%time
"""Importing and transforming data from Test File
"""

prod_df_test = spark.read.json(test)
df_test = prod_df_test.na.drop() #Wall time: 2min 15s
df_sel_test = df_test.select("session", explode("events").alias("events")).select("events.*","*").sort("ts").select("aid","ts","session","type")
df_indexed_test = df_sel_test.withColumn("typeindex", when(df_sel_test.type == "clicks",0)
                                 .when(df_sel_test.type == "carts",1)
                                 .otherwise(2))
df_session_type_test = df_indexed_test.withColumn("session_type",(df_indexed_test.session)*10 + df_indexed_test.typeindex).select("aid","ts","session","session_type","type")
## Aggregating using "Window Partition"
windowPartition = Window.partitionBy("session_type").orderBy("aid")

df_freq_test=df_session_type_test.withColumn("cume_dist",
              cume_dist().over(windowPartition)).withColumn("frequency", 
                                                            count(col("ts")).over(windowPartition))

df_final_test = df_freq_test.withColumn("sumOfEvents",sum(col("frequency")).over(windowPartition))
df_test = df_final_test.select("aid","session_type",(df_final_test.cume_dist*(df_final_test.frequency / df_final_test.sumOfEvents)).alias('interest'))
df_test_check=df_test.checkpoint() #Wall time: 29.1 s #Wall time: 24.8 s
df_test_check.write.parquet("test.parquet") 
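The notebook ends after writing the two parquet files; as a quick sanity check (my addition, following the commented read-back hint earlier), they can be loaded again to confirm the aid / session_type / interest schema:

# Hypothetical verification that the written outputs are readable and well-formed
train_ratings = spark.read.parquet("train.parquet")
test_ratings = spark.read.parquet("test.parquet")

train_ratings.printSchema()            # expected columns: aid, session_type, interest
train_ratings.show(5, truncate=False)
print(train_ratings.count(), test_ratings.count())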
