DEBUG = False
!pip install polars
!pip install snoop
from collections import defaultdict, Counter
import gc
from snoop import pp
import polars as pl
import pandas as pd
import numpy as np
import random
from polars.testing import assert_frame_equal, assert_series_equal
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', None)
cfg = pl.Config.restore_defaults()
pl.Config.set_tbl_rows(50)
pl.Config.set_fmt_str_lengths(1000)
if DEBUG: fraction_of_sessions_to_use = 0.00001
else: fraction_of_sessions_to_use = 1
train_ms = pl.scan_parquet('/kaggle/input/otto-radek-style-polars/train_ms.parquet')
test_ms = pl.scan_parquet('/kaggle/input/otto-radek-style-polars/test_ms.parquet')
sample_sub = pl.scan_csv('/kaggle/input/otto-recommender-system/sample_submission.csv')
Subset the train and test sets and concatenate them together
%%time
lucky_sessions_train = (
train_ms
.select([
pl.col('session').unique().sample(frac=fraction_of_sessions_to_use, seed=42)
])
.collect()
.to_series().to_list()
)
lucky_sessions_test = (
test_ms
.select([
pl.col('session').unique().sample(frac=fraction_of_sessions_to_use, seed=42)
])
.collect()
.to_series().to_list()
)
subset_of_train = (
train_ms
.filter(pl.col('session').is_in(lucky_sessions_train))
)
subset_of_test = (
test_ms
.filter(pl.col('session').is_in(lucky_sessions_test))
)
subsets = pl.concat([subset_of_train, subset_of_test]).collect()
sessions = subsets.select('session').unique().to_series().to_list()
pp(lucky_sessions_train[:3], len(lucky_sessions_train), lucky_sessions_test[:3], len(lucky_sessions_test),
subset_of_train.collect().height, subset_of_test.collect().height, subsets.height)
Create the co-visitation matrix
The co-visitation matrix is just a name for exploring the following ideas/questions:
Is there any relationship between an aid/product and the other aids/products within the same session, or across all sessions?
Are some aids more similar to certain aids and less similar to others?
When this aid is being viewed, are some aids more likely than others to be clicked/carted/ordered next?
Can we pair up the aids within each session and count how many times each pair occurs?
Since one aid (e.g. '122') can have many pair-partners, can we find the most common pair-partners of aid '122' by counting the occurrences of each pair ('122', partner)?
Could the next click, cart, or order of a test session be the most common pair-partner of its last aid (or of all its aids)? A minimal sketch of the data structure this idea leads to is shown below.
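To make the idea concrete, here is a minimal sketch of the structure we are about to build; the aid numbers and counts below are entirely made up for illustration.
# hypothetical co-visitation matrix: each aid maps to a Counter of its pair-partners and pair counts
sketch_next_AIDs = defaultdict(Counter)
sketch_next_AIDs[122].update({857: 14, 9031: 6, 450: 2})  # made-up numbers: aid 122 was paired with aid 857 fourteen times, etc.
print(sketch_next_AIDs[122].most_common(2))               # -> [(857, 14), (9031, 6)]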
The first challenge in building the co-visitation matrix is how to do the pairing.
In Radek's notebook, the pairing logic is as follows:
only the last 30 aids of each session are paired with each other
pairs of an aid with itself are removed
only pairs where the right partner occurs after the left partner and within one day are kept
We can tweak this pairing logic to change our co-visitation matrix. A toy illustration of these rules is sketched below, before the chunked polars implementation.
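The following is only a toy illustration of the pairing rules in plain Python; the events and timestamps are invented, and the actual notebook does the pairing with a polars self-join in the next cell.
# one made-up session: (aid, ts in milliseconds)
toy_events = [
    (100, 0),
    (200, 1 * 60 * 60 * 1000),   # 1 hour after the first event
    (300, 26 * 60 * 60 * 1000),  # 26 hours in, more than a day after both earlier events
]
toy_pairs = Counter()
for aid_left, ts_left in toy_events[-30:]:            # only the last 30 events of the session
    for aid_right, ts_right in toy_events[-30:]:
        days = (ts_right - ts_left) / (24 * 60 * 60 * 1000)
        if aid_left != aid_right and 0 <= days <= 1:  # no self-pairs; right partner must follow left within one day
            toy_pairs[(aid_left, aid_right)] += 1
print(toy_pairs)  # only (100, 200) survives; pairs involving aid 300 are dropped because the gap exceeds one day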
%%time
next_AIDs = defaultdict(Counter)
chunk_size = 300000
for i in range(0, len(sessions), chunk_size):
current_chunk = (
subsets
.filter(pl.col('session').is_between(sessions[i], sessions[np.min([i+chunk_size-1, len(sessions)-1])], closed='both'))
.unique() # no duplicates
.groupby('session').tail(30)
)
current_chunk = (
current_chunk
.join(current_chunk, on='session', suffix='_right')
.sort(['session', 'aid', 'aid_right']) # nice view
.filter(pl.col('aid') != pl.col('aid_right')) # no need for pairs of themselves
.with_columns([
((pl.col('ts_right') - pl.col('ts'))/(24*60*60*1000)).alias('days_elapsed') # time gap in days; positive means aid_right occurred after aid
])
.filter((pl.col('days_elapsed')>=0) & (pl.col('days_elapsed') <=1)) # keep only pairs where aid_right occurs after aid and within 24 hours
)
# defaultdict + Counter is much faster here than a pure polars solution
for aid_x, aid_y in zip(current_chunk.select('aid').to_series().to_list(), current_chunk.select('aid_right').to_series().to_list()):
next_AIDs[aid_x][aid_y] += 1
print(f'{int(np.ceil(i/chunk_size))} out of {int(np.ceil(len(sessions)/chunk_size))} - {np.min([i+chunk_size-1, len(sessions)-1])} sessions are done')
len(next_AIDs)
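As a quick sanity check on the matrix we just built (assuming it is non-empty), we can peek at the most common pair-partners of an arbitrary aid; the aid picked here is simply whichever key comes first.
some_aid = next(iter(next_AIDs))                  # an arbitrary aid that appears as a left partner
pp(some_aid, next_AIDs[some_aid].most_common(5))  # its 5 most frequent pair-partners with their pair counts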
The polars version is about as fast as the pandas version; see Radek's notebook for his timings.
del train_ms, subset_of_train, subsets
gc.collect()
Use the co-visitation matrix to provide candidate aids when a test session has fewer than 20 aids
Radek shows us two things here:
how to create features (weights based on recency, event type, and number of occurrences) to pick 20 aids for a test session
how to select candidates from the co-visitation matrix:
for each aid of the test session, take its 20 most common pair-partner aids and put them into a candidate list
take the 40 most common aids from the candidate list and, if they are new to the session, append them to the test session's aids
then take the first 20 aids as the prediction. A small sketch of the recency/type weighting on a toy session follows.
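The sketch below applies the recency/type weighting to a made-up session of three events; the type multipliers mirror the ones used in the loop further down, but the aids and types are invented purely for illustration.
toy_aids  = [100, 200, 100]           # made-up aids; aid 100 occurs twice
toy_types = [0, 0, 1]                 # 0=clicks, 1=carts, 2=orders
toy_type_weights = {0: 1, 1: 6, 2: 3}
# later events get larger time weights; `-1` shifts the values into roughly the [0, 1] range
toy_time_weights = np.logspace(start=0.1, stop=1, num=len(toy_aids), base=2, endpoint=True) - 1
toy_scores = defaultdict(lambda: 0)
for aid, w, t in zip(toy_aids, toy_time_weights, toy_types):
    toy_scores[aid] += w * toy_type_weights[t]    # recency weight times event-type weight, summed over occurrences
print(sorted(toy_scores.items(), key=lambda item: -item[1]))  # aid 100 outranks 200 thanks to its recent cart event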
%%time
lists_aids_types = (
test_ms
.unique() # drop duplicate rows
.groupby('session')
.agg([
pl.col('aid').list().alias('test_session_AIDs'),
pl.col('type').list().alias('test_session_types'),
])
.collect()
)
lists_aids_types.head()
%%time
labels = []
session_types = ['clicks', 'carts', 'orders']
no_data = 0
no_data_all_aids = 0
type_weight_multipliers = {0: 1, 1: 6, 2: 3}
test_session_AIDs = lists_aids_types.select('test_session_AIDs').to_series().to_list()
test_session_types = lists_aids_types.select('test_session_types').to_series().to_list()
# take each session's aids and types
for AIDs, types in zip(test_session_AIDs, test_session_types):
# if the session has 20 or more aids
if len(AIDs) >= 20:
# np.logspace: return numbers spaced evenly on a log scale
# `-1` shifts the weights into roughly the [0, 1] range
# weights are assigned to AIDs in chronological order, so more recent events get larger weights
weights=np.logspace(start=0.1,stop=1,num=len(AIDs),base=2, endpoint=True)-1
# create a defaultdict for this session only
# anything added into this dict will have a default value 0
# try `aids_temp[1]` and `aids_temp`
aids_temp=defaultdict(lambda: 0)
# within a session, an aid may occur multiple times, with different types and at different times;
# the loop below combines all 3 factors to score the importance of each aid to the session
# each unique aid and its aggregated weight are stored in a defaultdict
for aid,w,t in zip(AIDs,weights,types):
aids_temp[aid]+= w * type_weight_multipliers[t]
# sort the aids by their aggregated weights, largest first
sorted_aids=[k for k, v in sorted(aids_temp.items(), key=lambda item: -item[1])]
# when using the polars below to replace the line above, it is actually 2 times slower
# aid = [key for (key, value) in aids_temp.items()]
# adwt = [value for (key, value) in aids_temp.items()]
# sorted_aids = (
# pl.DataFrame([aid, adwt], columns=['aid', 'weight'])
# .sort('weight', reverse=True)
# .select('aid').to_series().to_list()
# )
# take the 20 aids with the largest weights from this session as one list and append it into a new list `labels`
labels.append(sorted_aids[:20])
# when this session has fewer than 20 aids
else:
# reverse the order of AIDs (a list of aids of this session) and remove the duplicated aids
AIDs = list(dict.fromkeys(AIDs[::-1])) # python version
# the polars alternative below is far slower than the line above
# AIDs = pl.Series('aid', AIDs).unique().reverse().to_list() # polars version
# keep track of the length of new AIDs above
AIDs_len_start = len(AIDs)
candidates = []
# for each unique aid of this session, look up its 20 most common pair-partners in the co-visitation matrix
# and extend the flat list `candidates` with them
# in the end `candidates` can be long and contain many duplicates
for AID in AIDs:
if AID in next_AIDs: candidates += [aid for aid, count in next_AIDs[AID].most_common(20)]
# take the 40 most common aids from `candidates`, and if they are not already among this session's AIDs,
# append them to AIDs (`+` keeps it a flat list, which `append` would not)
AIDs += [AID for AID, cnt in Counter(candidates).most_common(40) if AID not in AIDs]
# but we still only take the first 20 aids from AIDs as this session's prediction and store it in `labels`
labels.append(AIDs[:20])
# if no candidates were generated, add 1 to `no_data`
# if candidates == []: no_data += 1 # this variable is actually not used by Radek
# count the sessions whose AIDs were not extended by any candidates
if AIDs_len_start == len(AIDs): no_data_all_aids += 1
sample_sub.fetch().head()
Create the submission labels
%%time
(
pl.DataFrame({'session': lists_aids_types.select('session').to_series().to_list(),
'labels': labels})
.with_columns([
pl.col('labels').arr.eval(pl.element().cast(pl.Utf8)).arr.join(' '),
(pl.col('session')+"_clicks").alias('clicks'),
(pl.col('session')+"_carts").alias('carts'),
(pl.col('session')+"_orders").alias('orders'),
])
.select([
'session',
pl.concat_list(['clicks', 'carts', 'orders']).alias('session_type'),
'labels'
])
.explode('session_type')
.sort('session')
.select(pl.exclude('session'))
.write_csv('submission.csv')
)
print(f'Test sessions that we did not manage to extend based on the co-visitation matrix: {no_data_all_aids}')
pl.read_csv('submission.csv').shape
sample_sub.collect().shape
# from matplotlib import pyplot as plt
# plt.hist([len(l) for l in labels]);
# plt.suptitle('Distribution of predicted sequence lengths');