Surprise is an easy-to-use Python scikit for recommender systems.
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')
# Instantiate SVD
algo = SVD()
# Run 5-fold cross-validation and print the results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\Administrator/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std
RMSE (testset) 0.9429 0.9262 0.9353 0.9313 0.9427 0.9357 0.0065
MAE (testset) 0.7420 0.7306 0.7381 0.7343 0.7415 0.7373 0.0044
Fit time 6.75 6.65 6.81 6.97 6.79 6.79 0.10
Test time 0.29 0.28 0.31 0.24 0.28 0.28 0.03
{'fit_time': (6.748954772949219,
6.648886442184448,
6.814781904220581,
6.970685958862305,
6.785797357559204),
'test_mae': array([0.74200524, 0.73058076, 0.73807502, 0.73425662, 0.74150664]),
'test_rmse': array([0.94290798, 0.92623843, 0.9352968 , 0.93130338, 0.94273246]),
'test_time': (0.2868227958679199,
0.2778284549713135,
0.3148069381713867,
0.23685264587402344,
0.28182458877563477)}
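Beyond measures and cv, cross_validate() accepts further parameters (its full signature is listed in the model_selection reference later in this article). A minimal sketch, assuming you also want the metrics on the trainsets:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')
# return_train_measures adds 'train_rmse' / 'train_mae' entries to the
# returned dict, next to the 'test_*' entries shown above.
results = cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=5,
                         return_train_measures=True, verbose=False)
print(results['train_rmse'].mean())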
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split
# Load the data
data = Dataset.load_builtin('ml-100k')
# Hold out 25% of the data for testing
trainset, testset = train_test_split(data, test_size=.25)
# Instantiate
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)
# RMSE
accuracy.rmse(predictions)
RMSE: 0.9392
0.9391726088618421
We can also simply fit the algorithm on the whole dataset rather than running cross-validation. This is done with the build_full_trainset() method, which builds a trainset object:
from surprise import KNNBasic
from surprise import Dataset
# Load the data
data = Dataset.load_builtin('ml-100k')
# Build a trainset from the whole dataset
trainset = data.build_full_trainset()
# Instantiate the collaborative filtering algorithm and train it
algo = KNNBasic()
algo.fit(trainset)
Computing the msd similarity matrix...
Done computing similarity matrix.
uid = str(196)  # raw user id
iid = str(302)  # raw item id
# Predict the rating this user would give this item
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
user: 196 item: 302 r_ui = 4.00 est = 4.06 {'actual_k': 40, 'was_impossible': False}
Algorithm class | Description
---|---
random_pred.NormalPredictor | Predicts a random rating based on the distribution of the training set
baseline_only.BaselineOnly | Predicts the baseline estimate for a given user and item
knns.KNNBasic | The most basic collaborative filtering algorithm
knns.KNNWithMeans | Collaborative filtering that takes each user's mean rating into account
knns.KNNBaseline | Collaborative filtering that takes a baseline rating into account
matrix_factorization.SVD | The SVD implementation
matrix_factorization.SVDpp | SVD++, i.e. LFM plus SVD
matrix_factorization.NMF | Collaborative filtering based on non-negative matrix factorization
slope_one.SlopeOne | A simple yet accurate collaborative filtering algorithm
co_clustering.CoClustering | A collaborative filtering algorithm based on co-clustering
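All of these classes expose the same fit/test/predict interface as SVD, so any of them can be dropped into the snippets above unchanged. A minimal sketch with two algorithms from the table:
from surprise import KNNWithMeans, SlopeOne
from surprise import Dataset
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')
# Only the constructor call changes; the evaluation code stays the same.
for algo in (KNNWithMeans(), SlopeOne()):
    cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=True)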
Similarity measure | Description
---|---
cosine | Compute the cosine similarity between all pairs of users (or items).
msd | Compute the Mean Squared Difference similarity between all pairs of users (or items).
pearson | Compute the Pearson correlation coefficient between all pairs of users (or items).
pearson_baseline | Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items), using baselines for centering instead of means.
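The measure is chosen through the sim_options argument described later in this article. A minimal sketch switching a KNN algorithm to Pearson similarity:
from surprise import KNNBasic
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# 'name' selects one of the four measures in the table above
algo = KNNBasic(sim_options={'name': 'pearson'})
algo.fit(trainset)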
Accuracy metric | Description
---|---
rmse | Compute RMSE (Root Mean Squared Error).
mae | Compute MAE (Mean Absolute Error).
fcp | Compute FCP (Fraction of Concordant Pairs).
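All three metrics are functions of the accuracy module and take a list of predictions, as in this sketch:
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import train_test_split

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.25)
predictions = SVD().fit(trainset).test(testset)
# Each helper prints its score (verbose is True by default) and
# also returns it as a float.
accuracy.rmse(predictions)
accuracy.mae(predictions)
accuracy.fcp(predictions)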
import os
from surprise import BaselineOnly  # predicts the baseline estimate for a given user and item
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split
# Path to the dataset file
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
# line_format is 'user item rating timestamp', separated by '\t'
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
# Run cross-validation on the data
cross_validate(BaselineOnly(), data, verbose=True)
# Hold out 25% of the data for testing
trainset, testset = train_test_split(data, test_size=.25)
blo = BaselineOnly()
blo.fit(trainset)
blo.predict(str(196), str(302), r_ui=4, verbose=True)  # raw ids are strings when the data is loaded from a file
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).
Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std
RMSE (testset) 0.9470 0.9402 0.9467 0.9442 0.9418 0.9440 0.0027
MAE (testset) 0.7528 0.7427 0.7502 0.7480 0.7474 0.7482 0.0034
Fit time 0.23 0.28 0.33 0.25 0.24 0.27 0.04
Test time 0.28 0.32 0.25 0.19 0.24 0.26 0.04
Estimating biases using als...
3.8799791205908227
import pandas as pd
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
# Create the data
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
'userID': [9, 32, 2, 45, 'user_foo'],
'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)
# The rating scale is 1 to 5
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
print(type(data))
# Evaluate
cross_validate(NormalPredictor(), data, cv=3)
{'fit_time': (0.0, 0.0, 0.0),
'test_mae': array([1.82749483, 1.36961054, 1.08665964]),
'test_rmse': array([2.42042007, 1.3756825 , 1.08665964]),
'test_time': (0.0, 0.0009999275207519531, 0.0)}
For cross-validation, the cross_validate() function can do all the hard work for us. For finer control, though, we can also instantiate a cross-validation iterator and make predictions over each split using the iterator's split() method and the algorithm's test() method.
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import KFold
# Load the data
data = Dataset.load_builtin('ml-100k')
# 3-fold cross-validation
kf = KFold(n_splits=3)
algo = SVD()
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Evaluate
    accuracy.rmse(predictions, verbose=True)  # verbose: if True, print the result.
RMSE: 0.9460
RMSE: 0.9494
RMSE: 0.9457
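KFold is only one of the iterators in surprise.model_selection (the full list appears in the reference section below), and they all expose the same split() method. A minimal sketch with ShuffleSplit:
from surprise import SVD, Dataset, accuracy
from surprise.model_selection import ShuffleSplit

data = Dataset.load_builtin('ml-100k')
# 3 random splits, each holding out 25% of the ratings for testing
ss = ShuffleSplit(n_splits=3, test_size=.25)
algo = SVD()
for trainset, testset in ss.split(data):
    algo.fit(trainset)
    accuracy.rmse(algo.test(testset), verbose=True)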
The movielens-100k dataset already provides 5 predefined train and test files (u1.base, u1.test ... u5.base, u5.test).
Surprise can handle this situation by using the surprise.model_selection.split.PredefinedKFold object:
import os
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
reader = Reader('ml-100k')
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()
algo = SVD()
for trainset, testset in pkf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    # print(predictions)
The cross_validate() function reports accuracy metrics of a cross-validation procedure for a given set of parameters.
If you want to know which parameter combination yields the best results, the GridSearchCV class is the answer.
Given a dict of parameters, this class exhaustively tries all combinations and reports the best parameters for any accuracy measure (averaged over the different splits).
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV
data = Dataset.load_builtin('ml-100k')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'])   # best score
print(gs.best_params['rmse'])  # best parameters
import pandas as pd
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df
0.9642869135146698
{'reg_all': 0.4, 'n_epochs': 10, 'lr_all': 0.005}
| | mean_fit_time | mean_test_mae | mean_test_rmse | mean_test_time | param_lr_all | param_n_epochs | param_reg_all | params | rank_test_mae | rank_test_rmse | split0_test_mae | split0_test_rmse | split1_test_mae | split1_test_rmse | split2_test_mae | split2_test_rmse | std_fit_time | std_test_mae | std_test_rmse | std_test_time |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1.496731 | 0.806236 | 0.997472 | 0.511683 | 0.002 | 5 | 0.4 | {'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.002} | 7 | 7 | 0.807283 | 0.997869 | 0.807397 | 0.999162 | 0.804027 | 0.995386 | 0.034429 | 0.001562 | 0.001567 | 0.062379 |
1 | 1.403456 | 0.782359 | 0.974123 | 0.497358 | 0.005 | 5 | 0.4 | {'reg_all': 0.4, 'n_epochs': 5, 'lr_all': 0.005} | 2 | 2 | 0.783014 | 0.974045 | 0.784169 | 0.976183 | 0.779894 | 0.972142 | 0.003297 | 0.001806 | 0.001650 | 0.062849 |
2 | 2.811914 | 0.786120 | 0.978227 | 0.492694 | 0.002 | 10 | 0.4 | {'reg_all': 0.4, 'n_epochs': 10, 'lr_all': 0.002} | 4 | 4 | 0.786966 | 0.978410 | 0.787427 | 0.979666 | 0.783967 | 0.976606 | 0.003398 | 0.001534 | 0.001256 | 0.064478 |
3 | 2.794590 | 0.773040 | 0.964287 | 0.537333 | 0.005 | 10 | 0.4 | {'reg_all': 0.4, 'n_epochs': 10, 'lr_all': 0.005} | 1 | 1 | 0.773070 | 0.963666 | 0.775167 | 0.966541 | 0.770884 | 0.962653 | 0.008335 | 0.001749 | 0.001647 | 0.012490 |
4 | 1.410455 | 0.814898 | 1.003614 | 0.484698 | 0.002 | 5 | 0.6 | {'reg_all': 0.6, 'n_epochs': 5, 'lr_all': 0.002} | 8 | 8 | 0.816255 | 1.004197 | 0.815952 | 1.005326 | 0.812487 | 1.001319 | 0.005308 | 0.001710 | 0.001687 | 0.060862 |
5 | 1.470082 | 0.793487 | 0.983101 | 0.542994 | 0.005 | 5 | 0.6 | {'reg_all': 0.6, 'n_epochs': 5, 'lr_all': 0.005} | 5 | 5 | 0.794289 | 0.983202 | 0.795240 | 0.985284 | 0.790931 | 0.980816 | 0.023524 | 0.001848 | 0.001825 | 0.058165 |
6 | 2.980475 | 0.796703 | 0.986454 | 0.527671 | 0.002 | 10 | 0.6 | {'reg_all': 0.6, 'n_epochs': 10, 'lr_all': 0.002} | 6 | 6 | 0.797903 | 0.986878 | 0.797934 | 0.988105 | 0.794272 | 0.984379 | 0.087440 | 0.001719 | 0.001550 | 0.018768 |
7 | 2.823572 | 0.784945 | 0.974213 | 0.494693 | 0.005 | 10 | 0.6 | {'reg_all': 0.6, 'n_epochs': 10, 'lr_all': 0.005} | 3 | 3 | 0.785202 | 0.973794 | 0.787113 | 0.976659 | 0.782519 | 0.972187 | 0.003396 | 0.001884 | 0.001850 | 0.057241 |
# Retrieve the model with the best parameters
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())
algo.predict(193, 302, 4, verbose=True)  # NB: raw ids here are strings, so these int ids are unknown to the trainset and the estimate falls back to the global mean
user: 193 item: 302 r_ui = 4.00 est = 3.53 {'was_impossible': False}
Prediction(uid=193, iid=302, r_ui=4, est=3.52986, details={'was_impossible': False})
Using Surprise from the command line
surprise -algo SVD -params "{'n_epochs': 5, 'verbose': True}" -load-builtin ml-100k -n-folds 3
surprise -h
Surprise provides a bunch of built-in algorithms. All algorithms derive from the AlgoBase base class, where some key methods are implemented (e.g. predict, fit and test). The list and details of the available prediction algorithms can be found in the prediction_algorithms package documentation.
Each algorithm is part of the global Surprise namespace, so you only need to import its name from the surprise package:
from surprise import KNNBasic
algo = KNNBasic()
Some of these algorithms may use baseline estimates, and some may use a similarity measure. Baselines are estimated by minimizing the following regularized squared error:

$$\sum_{r_{ui} \in R_{train}} \left(r_{ui} - (\mu + b_u + b_i)\right)^2 + \lambda \left(b_u^2 + b_i^2\right)$$
Baselines can be estimated in two different ways:
using Stochastic Gradient Descent (SGD);
using Alternating Least Squares (ALS).
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo = BaselineOnly(bsl_options=bsl_options)
Using ALS
print('Using SGD')
bsl_options = {'method': 'sgd',
               'learning_rate': .00005,
               }
algo = BaselineOnly(bsl_options=bsl_options)
Using SGD
bsl_options = {'method': 'als',
               'n_epochs': 20,
               }
sim_options = {'name': 'pearson_baseline'}
algo = KNNBasic(bsl_options=bsl_options, sim_options=sim_options)
Many algorithms use a similarity measure to estimate ratings. They are configured much like the baseline ratings: just pass a sim_options argument when creating the algorithm. This argument is a dictionary with the following (all optional) keys:
'name': the name of the similarity, as defined in the similarities module. Default is 'MSD'.
'user_based': whether similarities are computed between users or between items. This has a huge impact on the performance of a prediction algorithm. Default is True.
'min_support': the minimum number of common items (when 'user_based' is 'True') or the minimum number of common users (when 'user_based' is 'False') for the similarity not to be zero. Simply put, if |I_uv| < min_support then sim(u, v) = 0 (and the same goes for items).
'shrinkage': the shrinkage parameter to apply (only relevant for the pearson_baseline similarity). Default is 100.
sim_options = {'name': 'cosine',
               'user_based': False  # compute similarities between items
               }
algo = KNNBasic(sim_options=sim_options)
sim_options = {'name': 'pearson_baseline',
               'shrinkage': 0  # no shrinkage
               }
algo = KNNBasic(sim_options=sim_options)
3 How to build your own prediction algorithm
Creating your own prediction algorithm is pretty simple: an algorithm is nothing more than a class derived from AlgoBase that has an estimate method. This is the method called by predict(): it takes an inner user id and an inner item id, and returns the estimated rating.
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate
import numpy as np
class MyOwnAlgorithm(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        # Compute the mean rating of the trainset once, at fit time
        self.the_mean = np.mean([r for (_, _, r) in
                                 self.trainset.all_ratings()])
        return self

    def estimate(self, u, i):
        # Average the global mean with the user's and item's mean ratings,
        # when the user / item is known
        sum_means = self.trainset.global_mean
        div = 1
        if self.trainset.knows_user(u):
            sum_means += np.mean([r for (_, r) in self.trainset.ur[u]])
            div += 1
        if self.trainset.knows_item(i):
            sum_means += np.mean([r for (_, r) in self.trainset.ir[i]])
            div += 1
        return sum_means / div
data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()
cross_validate(algo, data, verbose=True)
Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).
Fold 1 Fold 2 Fold 3 Fold 4 Fold 5 Mean Std
RMSE (testset) 1.0179 1.0165 1.0175 1.0216 1.0156 1.0178 0.0021
MAE (testset) 0.8380 0.8356 0.8376 0.8414 0.8364 0.8378 0.0020
Fit time 0.04 0.06 0.06 0.07 0.08 0.06 0.01
Test time 2.94 2.86 2.95 3.05 3.05 2.97 0.07
{'fit_time': (0.03598380088806152,
0.06396150588989258,
0.05696725845336914,
0.06996297836303711,
0.07695245742797852),
'test_mae': array([0.83803386, 0.83556254, 0.83764556, 0.84141284, 0.83639388]),
'test_rmse': array([1.01792507, 1.01651414, 1.0175074 , 1.02157154, 1.01555266]),
'test_time': (2.9401426315307617,
2.862196445465088,
2.9531378746032715,
3.045079231262207,
3.051081657409668)}
When a prediction is impossible, the estimate method can raise PredictionImpossible, and predict() will fall back to a default estimate. The fit method may also precompute baselines and similarities for estimate to use:
from surprise import PredictionImpossible

class MyOwnAlgorithm(AlgoBase):

    def __init__(self, sim_options={}, bsl_options={}):
        AlgoBase.__init__(self, sim_options=sim_options,
                          bsl_options=bsl_options)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.bu, self.bi = self.compute_baselines()
        self.sim = self.compute_similarities()
        return self

    def estimate(self, u, i):
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')
        # Compute similarities between u and v, where v describes all other
        # users that have rated item i
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort the neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)
        print('The 3 nearest neighbors of user', str(u), 'are:')
        for v, sim_uv in neighbors[:3]:
            print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))
        # ...and return the baseline estimate as the prediction
        return self.trainset.global_mean + self.bu[u] + self.bi[i]
4 prediction_algorithms package
https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html
5 The model_selection package
https://surprise.readthedocs.io/en/stable/model_selection.html
Cross-validation iterators (instantiate them before use):
KFold: A basic cross-validation iterator.
RepeatedKFold: Repeated KFold cross validator.
ShuffleSplit: A basic cross-validation iterator with random trainsets and testsets.
LeaveOneOut: Cross-validation iterator where each user has exactly one rating in the testset.
PredefinedKFold: A cross-validation iterator for when a dataset has been loaded with the load_from_folds method.
Cross validation:
surprise.model_selection.validation.cross_validate(algo, data, measures=['rmse', 'mae'], cv=None, return_train_measures=False, n_jobs=1, pre_dispatch='2*n_jobs', verbose=False)
Parameter search:
surprise.model_selection.search.GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, pre_dispatch='2*n_jobs', joblib_verbose=0)
surprise.model_selection.search.RandomizedSearchCV(algo_class, param_distributions, n_iter=10, measures=['rmse', 'mae'], cv=None, refit=False, return_train_measures=False, n_jobs=1, pre_dispatch='2*n_jobs', random_state=None, joblib_verbose=0)
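Per the signature above, RandomizedSearchCV samples n_iter parameter combinations instead of exhaustively trying all of them like GridSearchCV. A minimal sketch mirroring the earlier GridSearchCV example:
from surprise import SVD, Dataset
from surprise.model_selection import RandomizedSearchCV

data = Dataset.load_builtin('ml-100k')
param_distributions = {'n_epochs': [5, 10, 20],
                       'lr_all': [0.002, 0.005, 0.01]}
# Samples 5 of the 9 possible combinations at random
rs = RandomizedSearchCV(SVD, param_distributions, n_iter=5,
                        measures=['rmse'], cv=3)
rs.fit(data)
print(rs.best_score['rmse'], rs.best_params['rmse'])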
6 similarities module
https://surprise.readthedocs.io/en/stable/similarities.html
cosine: Compute the cosine similarity between all pairs of users (or items).
msd: Compute the Mean Squared Difference similarity between all pairs of users (or items).
pearson: Compute the Pearson correlation coefficient between all pairs of users (or items).
pearson_baseline: Compute the (shrunk) Pearson correlation coefficient between all pairs of users (or items) using baselines for centering instead of means.
7 accuracy module
https://surprise.readthedocs.io/en/stable/accuracy.html
rmse: Compute RMSE (Root Mean Squared Error).
mae: Compute MAE (Mean Absolute Error).
fcp: Compute FCP (Fraction of Concordant Pairs).
8 dataset module
https://surprise.readthedocs.io/en/stable/dataset.html
Dataset.load_builtin: Load a built-in dataset.
Dataset.load_from_file: Load a dataset from a (custom) file.
Dataset.load_from_folds: Load a dataset where folds (for cross-validation) are predefined by some files.
Dataset.folds: Generator function to iterate over the folds of the Dataset.
DatasetAutoFolds.split: Split the dataset into folds for future cross-validation.
9 Trainset class
https://surprise.readthedocs.io/en/stable/trainset.html
surprise.Trainset(ur, ir, n_users, n_items, n_ratings, rating_scale, offset, raw2inner_id_users, raw2inner_id_items)
Users and items have a raw id and an inner id. Some methods use or return raw ids (e.g. predict()), while others use or return inner ids. Raw ids are the ids as defined in the ratings file or pandas dataframe; they can be strings or numbers. Note that ratings read from a file (the standard scenario) are represented as strings, which matters when you call predict() or any other method that accepts raw ids. At trainset creation, each raw id is mapped to a unique integer called the inner id, which is much more suitable for Surprise to manipulate. Conversions between raw and inner ids are done with the to_inner_uid(), to_inner_iid(), to_raw_uid() and to_raw_iid() methods below.
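A minimal sketch of converting between the two id spaces (recall that raw ids of the built-in ml-100k dataset are strings):
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Raw ids come from the ratings file; inner ids are contiguous integers
inner_uid = trainset.to_inner_uid('196')
inner_iid = trainset.to_inner_iid('302')
print(trainset.to_raw_uid(inner_uid), trainset.to_raw_iid(inner_iid))  # 196 302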
global_mean
The mean of all ratings μ. It is only computed once.
all_items()
Generator function to iterate over all items. Yields: inner id of items.
all_ratings()
Generator function to iterate over all ratings. Yields: a tuple (uid, iid, rating) where ids are inner ids.
all_users()
Generator function to iterate over all users. Yields: inner id of users.
build_anti_testset(fill=None)
Return a list of ratings that can be used as a testset in the test() method. The ratings are all the ratings that are not in the trainset, i.e. all the ratings rui where the user u is known, the item i is known, but the rating rui is not in the trainset. As rui is unknown, it is either replaced by the fill value or assumed to be equal to the mean of all ratings global_mean.
Parameters: fill (float) – The value to fill unknown ratings. If None, the global mean of all ratings global_mean will be used.
Returns: A list of tuples (uid, iid, fill) where ids are raw ids.
build_testset()
Return a list of ratings that can be used as a testset in the test() method. The ratings are all the ratings that are in the trainset, i.e. all the ratings returned by the all_ratings() generator. This is useful in cases where you want to test your algorithm on the trainset.
knows_item(iid)
Indicate if the item is part of the trainset. An item is part of the trainset if it was rated at least once.
Parameters: iid (int) – The (inner) item id.
Returns: True if the item is part of the trainset, else False.
knows_user(uid)
Indicate if the user is part of the trainset. A user is part of the trainset if they have at least one rating.
Parameters: uid (int) – The (inner) user id.
Returns: True if the user is part of the trainset, else False.
to_inner_iid(riid)
Convert an item raw id to an inner id.
Parameters: riid (str) – The item raw id.
Returns: The item inner id.
Return type: int
Raises: ValueError – When the item is not part of the trainset.
to_inner_uid(ruid)
Convert a user raw id to an inner id.
Parameters: ruid (str) – The user raw id.
Returns: The user inner id.
Return type: int
Raises: ValueError – When the user is not part of the trainset.
to_raw_iid(iiid)
Convert an item inner id to a raw id.
Parameters: iiid (int) – The item inner id.
Returns: The item raw id.
Return type: str
Raises: ValueError – When iiid is not an inner id.
to_raw_uid(iuid)
Convert a user inner id to a raw id.
Parameters: iuid (int) – The user inner id.
Returns: The user raw id.
Return type: str
Raises: ValueError – When iuid is not an inner id.
10 Reader
https://surprise.readthedocs.io/en/stable/reader.html
surprise.reader.Reader(name=None, line_format='user item rating', sep=None, rating_scale=(1, 5), skip_lines=0)
name (string, optional) – If specified, a Reader for one of the built-in datasets is returned and any other parameter is ignored. Accepted values are 'ml-100k', 'ml-1m', and 'jester'. Default is None.
line_format (string) – The field names, in the order in which they are encountered on a line. Note that line_format is always space-separated (use the sep parameter). Default is 'user item rating'.
sep (char) – The separator between fields. Example: ';'.
rating_scale (tuple, optional) – The rating scale used for every rating. Default is (1, 5).
skip_lines (int, optional) – Number of lines to skip at the beginning of the file. Default is 0.
11 evaluate module
https://surprise.readthedocs.io/en/stable/evaluate.html
1 surprise.evaluate.GridSearch(algo_class, param_grid, measures=['rmse', 'mae'], n_jobs=1, pre_dispatch='2*n_jobs', seed=None, verbose=1, joblib_verbose=0)
cv_results (dict of arrays) – Contains all parameters and accuracy information for each combination. Can be imported into a pandas DataFrame.
best_estimator (dict of AlgoBase) – Using an accuracy measure as key, get the estimator that gave the best accuracy results for the chosen measure.
best_score (dict of floats) – Using an accuracy measure as key, get the best score achieved for that measure.
best_params (dict of dicts) – Using an accuracy measure as key, get the parameter combination that gave the best accuracy results for the chosen measure.
best_index (dict of ints) – Using an accuracy measure as key, get the index (usable with cv_results) that achieved the highest accuracy for that measure.
evaluate(data) – Runs the grid search on the dataset. Class instance attributes can be accessed after evaluate is done. Parameters: data (Dataset) – The dataset on which to evaluate the algorithm.
2 surprise.evaluate.evaluate(algo, data, measures=['rmse', 'mae'], with_dump=False, dump_dir=None, verbose=1)
12 dump module
https://surprise.readthedocs.io/en/stable/dump.html
surprise.dump.dump(file_name, predictions=None, algo=None, verbose=0)
surprise.dump.load(file_name)
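A minimal sketch of persisting a trained model with dump() and restoring it with load() (the file name here is arbitrary):
from surprise import SVD, Dataset, dump

data = Dataset.load_builtin('ml-100k')
algo = SVD()
algo.fit(data.build_full_trainset())
# dump() can serialize predictions, the algorithm, or both;
# load() returns a (predictions, algo) tuple.
dump.dump('./svd_model.dump', algo=algo)
predictions, loaded_algo = dump.load('./svd_model.dump')
print(loaded_algo.predict('196', '302'))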
Example
# -*- coding:utf-8 -*-
import os
import io
from surprise import KNNBaseline
from surprise import Dataset
# Step 1: train the recommendation model
def get_sim_model():
    # Load the built-in movielens dataset
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    # Compute similarities with pearson_baseline; user_based=False means
    # item-based similarities, i.e. similarities between movies here
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Use the KNNBaseline algorithm
    algo = KNNBaseline(sim_options=sim_options)
    # Train the model
    algo.fit(trainset)
    return algo
# Step 2: build the mappings between ids and names
def read_item_names():
    """
    Build the movie-name-to-movie-id and movie-id-to-movie-name mappings.
    """
    file_name = (os.path.expanduser('~') + '/.surprise_data/ml-100k/ml-100k/u.item')
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]
    return rid_to_name, name_to_rid
# Step 3: recommend similar movies with the trained model
def show_similar_movies(algo, rid_to_name, name_to_rid):
    # Get the raw id of the movie Toy Story (1995)
    toy_story_raw_id = name_to_rid['Toy Story (1995)']
    # Convert the movie's raw id to the model's inner id
    toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
    # Retrieve the nearest neighbors from the model; 10 movies here
    toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, 10)
    # Convert the inner ids back to actual movie ids
    neighbors_raw_ids = [algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors]
    # Turn the list of movie ids into a list of movie names
    neighbors_movies = [rid_to_name[raw_id] for raw_id in neighbors_raw_ids]
    print('The 10 nearest neighbors of Toy Story are:')
    for movie in neighbors_movies:
        print(movie)
if __name__ == '__main__':
    # Build the id-to-name mappings
    rid_to_name, name_to_rid = read_item_names()
    # Train the recommendation model
    algo = get_sim_model()
    # Show similar movies
    show_similar_movies(algo, rid_to_name, name_to_rid)
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)