This example implements the ALS matrix factorization algorithm on the MovieLens ml-100k dataset; a download link for the dataset is attached.
Open Jupyter Notebook and create a new Python 3 notebook:
import os
import sys
# Dynamically add the pyspark libraries to the Python path
SPARK_HOME = os.environ.get('SPARK_HOME')
sys.path.insert(0, os.path.join(SPARK_HOME, 'python'))
sys.path.insert(0, os.path.join(SPARK_HOME, 'python/lib/py4j-0.10.7-src.zip'))
# exec(open(os.path.join(SPARK_HOME, 'python/pyspark/shell.py')).read())
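The py4j zip filename above is tied to a specific Spark release; a minimal sketch, assuming the standard layout under $SPARK_HOME/python/lib, that resolves it with glob instead of hard-coding the version:
import glob
# Locate the py4j source zip without hard-coding its version number
py4j_zip = glob.glob(os.path.join(SPARK_HOME, 'python/lib/py4j-*-src.zip'))[0]
sys.path.insert(0, py4j_zip)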
# Create a SparkContext instance
from pyspark import SparkContext
sc = SparkContext('local','ALS')
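To use every available local core rather than a single one, the master can be set to 'local[*]' instead:
# Alternative: run on all available local cores
# sc = SparkContext('local[*]', 'ALS')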
# Read u.data
# By default Spark reads from HDFS, so add the file: prefix to indicate a local path
user_data = sc.textFile('file:/usr/local/test/u.data')
user_data.first()
'196\t242\t3\t881250949'
# The four fields are user_id, item_id, rating, and timestamp, separated by \t
# The timestamp is not needed, so keep only the first three fields
rate = user_data.map(lambda x: x.split("\t")[0:3])
rate.first()
['196', '242', '3']
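A quick sanity check: ml-100k contains 100,000 ratings from 943 users on 1,682 movies, so counting the parsed records should return 100000:
# Confirm all of ml-100k's 100,000 ratings were loaded
rate.count()  # expected output: 100000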
# Convert to the Rating format
from pyspark.mllib.recommendation import Rating
rate_data = rate.map(lambda x: Rating(int(x[0]), int(x[1]), int(x[2])))
rate_data.first()
Rating(user=196, product=242, rating=3.0)
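For a proper evaluation you would normally hold out part of the data before training; a minimal sketch using randomSplit (the names train_data and test_data are illustrative, and this walkthrough keeps training on the full rate_data):
# Hold out 20% of the ratings for testing (illustrative; not used below)
train_data, test_data = rate_data.randomSplit([0.8, 0.2], seed=42)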
# Train the model
from pyspark.mllib.recommendation import ALS
# Checkpointing: write intermediate RDDs to disk so the lineage
# does not grow unbounded across ALS iterations
sc.setCheckpointDir('checkpoint/')
ALS.checkpointInterval = 2
# Set the factorization rank to 20, the number of iterations to 5, and the regularization coefficient to 0.02
model = ALS.train(ratings=rate_data, rank=20, iterations=5, lambda_=0.02)
# Predict user 666's rating for movie 666
model.predict(666, 666)
1.6793250024079442
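To judge how well the factorization reconstructs the known ratings, every (user, item) pair can be scored in bulk with predictAll and compared against the originals via mean squared error; this follows the evaluation pattern in the Spark MLlib documentation:
# Strip ratings down to (user, product) pairs and predict them in bulk
test_pairs = rate_data.map(lambda r: (r.user, r.product))
predictions = model.predictAll(test_pairs).map(lambda r: ((r.user, r.product), r.rating))
# Join predicted ratings against the observed ones and average the squared error
rates_and_preds = rate_data.map(lambda r: ((r.user, r.product), r.rating)).join(predictions)
mse = rates_and_preds.map(lambda r: (r[1][0] - r[1][1]) ** 2).mean()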
# Recommend the ten movies user 666 is predicted to like most
model.recommendProducts(666, 10)
[Rating(user=666, product=1131, rating=5.103325669008839),
Rating(user=666, product=262, rating=5.037359302350613),
Rating(user=666, product=242, rating=5.029183432693649),
Rating(user=666, product=302, rating=4.951169172263912),
Rating(user=666, product=874, rating=4.930831851890693),
Rating(user=666, product=268, rating=4.918230254487493),
Rating(user=666, product=246, rating=4.886179007705294),
Rating(user=666, product=1449, rating=4.8740748227569455),
Rating(user=666, product=900, rating=4.870815539606349),
Rating(user=666, product=269, rating=4.870557667136832)]
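These recommendations come from the learned rank-20 latent factors, which can be inspected directly; userFeatures() and productFeatures() each return an RDD of (id, factor array) pairs:
# Each entry is (id, array of 20 latent factors)
model.userFeatures().first()
model.productFeatures().first()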
# Recommend the top three movies for every user
model.recommendProductsForUsers(3).collect()
[(451,
(Rating(user=451, product=350, rating=5.669488103740991),
Rating(user=451, product=1294, rating=5.5195576378824915),
Rating(user=451, product=260, rating=5.225247353937471))),
(454,
(Rating(user=454, product=1129, rating=4.471292220516824),
Rating(user=454, product=512, rating=4.423658991387322),
Rating(user=454, product=102, rating=4.397846063233699))),
(147,
(Rating(user=147, product=344, rating=6.150454318304409),
Rating(user=147, product=1038, rating=5.899384353951334),
Rating(user=147, product=533, rating=5.643529079524351))),
...
]
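With only 943 users, collect() is harmless here, but on a larger dataset pulling every user's recommendations back to the driver can exhaust its memory; take() retrieves just a sample:
# Pull only three entries back to the driver
model.recommendProductsForUsers(3).take(3)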
# For each movie, find the three users predicted to rate it highest
model.recommendUsersForProducts(3).collect()
[(1084,
(Rating(user=341, product=1084, rating=6.714810319516666),
Rating(user=675, product=1084, rating=6.4562103138149896),
Rating(user=511, product=1084, rating=6.283989191188922))),
(1410,
(Rating(user=93, product=1410, rating=5.288591420885184),
Rating(user=487, product=1410, rating=4.858704979801361),
Rating(user=493, product=1410, rating=4.5118411059921755))),
(667,
(Rating(user=614, product=667, rating=5.417582096167826),
Rating(user=762, product=667, rating=5.1406747758074065),
Rating(user=475, product=667, rating=4.989878412498716))),
...
]
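Finally, the trained model can be persisted and reloaded later (the path below is illustrative), and the SparkContext shut down when you are done:
from pyspark.mllib.recommendation import MatrixFactorizationModel
# Persist the factor matrices; the path is illustrative
model.save(sc, 'file:/usr/local/test/als_model')
reloaded = MatrixFactorizationModel.load(sc, 'file:/usr/local/test/als_model')
# Release Spark resources
sc.stop()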