最简单的推荐系统,起到抛砖引玉的作用,各位看官见谅!使用的是MovieLens里面的ml-100k的范例数据集。
path="hdfs://localhost:9000/user/fantastic_liar/"
rawUserData=sc.textFile(path+"data/u.data")
rawUserData.count()
100000
# 用户id,项目id,评价,日期时间
rawUserData.first()
'196\t242\t3\t881250949'
#导入Rating模块
from pyspark.mllib.recommendation import Rating
rawRatings=rawUserData.map(lambda line:line.split("\t")[:3])
rawRatings.take(5)
[['196', '242', '3'],
['186', '302', '3'],
['22', '377', '1'],
['244', '51', '2'],
['166', '346', '1']]
#ALS训练数据格式是RatingRDD数据类型,Rating定义如下(Rating(user,product,rating))
ratingsRDD=rawRatings.map(lambda x:(x[0],x[1],x[2]))
ratingsRDD.take(5)
[('196', '242', '3'),
('186', '302', '3'),
('22', '377', '1'),
('244', '51', '2'),
('166', '346', '1')]
numRatings=ratingsRDD.count()
numRatings
100000
#查看不重复用户数
numUsers=ratingsRDD.map(lambda x:x[0]).distinct().count()
print('numUsers:'+str(numUsers))
#查看不重复电影数
numMovies=ratingsRDD.map(lambda x:x[1]).distinct().count()
print('numMovies:'+str(numMovies))
numUsers:943
numMovies:1682
#训练模型
from pyspark.mllib.recommendation import ALS
ALS.train(ratings,rank,iterations=5,lambda_=0.01)
返回MatrixFactorizationModel
ALS.trainImplicit(ratings,rank,iterations=5,lambda_=0.01)
返回MatrixFactorizationModel
model=ALS.train(ratingsRDD,10,10,0.01)
print(model)
model.recommendProducts(user_id,num)
#使用模型进行推荐,给id为100的用户推荐前5部电影
model.recommendProducts(100,5)
[Rating(user=100, product=6, rating=6.241936757876449),
Rating(user=100, product=904, rating=5.539421122656328),
Rating(user=100, product=703, rating=5.422684536785385),
Rating(user=100, product=867, rating=5.396181773515554),
Rating(user=100, product=454, rating=5.235128109156419)]
#查看针对用户推荐产品的评分
model.predict(100,1141)
3.252229749490977
#针对电影200id 推荐前5个用户
model.recommendUsers(product=200,num=5)
[Rating(user=153, product=200, rating=7.209422239878188),
Rating(user=818, product=200, rating=6.953184913847615),
Rating(user=362, product=200, rating=6.2459555022979565),
Rating(user=252, product=200, rating=6.239842450193151),
Rating(user=547, product=200, rating=6.146069045954929)]
itemRDD=sc.textFile(path+"data/u.item")
print("电影数量:"+str(itemRDD.count()))
itemRDD.first()
电影数量:1682
'1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0'
#根据行数据进行处理
movieTitle=itemRDD.map(lambda line:line.split("|")).map(lambda a:(float(a[0]),a[1])).collectAsMap()
print(list(movieTitle.items())[:5])
[(1.0, 'Toy Story (1995)'), (2.0, 'GoldenEye (1995)'), (3.0, 'Four Rooms (1995)'), (4.0, 'Get Shorty (1995)'), (5.0, 'Copycat (1995)')]
### 针对用户id100 推荐前5个电影,并显示电影名
recommendP=model.recommendProducts(100,5)
for p in recommendP:
print('对用户:'+str(p[0])+\
' 推荐电影:'+str(movieTitle[p[1]])+\
' 推荐评分:'+str(p[2]))
对用户:100 推荐电影:Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 推荐评分:6.241936757876449
对用户:100 推荐电影:Ma vie en rose (My Life in Pink) (1997) 推荐评分:5.539421122656328
对用户:100 推荐电影:Widows' Peak (1994) 推荐评分:5.422684536785385
对用户:100 推荐电影:Whole Wide World, The (1996) 推荐评分:5.396181773515554
对用户:100 推荐电影:Bastard Out of Carolina (1996) 推荐评分:5.235128109156419